Files
bookmarksite/tests/test_enhanced_duplicate_detection.html
Rainer Koschnick 0abee5b794 Add comprehensive database setup and user management system
- Implement PostgreSQL database schema with users and bookmarks tables
- Add database connection pooling with retry logic and error handling
- Create migration system with automatic schema initialization
- Add database CLI tools for management (init, status, validate, etc.)
- Include comprehensive error handling and diagnostics
- Add development seed data and testing utilities
- Implement health monitoring and connection pool statistics
- Create detailed documentation and troubleshooting guide

Database features:
- Users table with authentication fields and email verification
- Bookmarks table with user association and metadata
- Proper indexes for performance optimization
- Automatic timestamp triggers
- Transaction support with rollback handling
- Connection pooling (20 max connections, 30s idle timeout)
- Graceful shutdown handling

CLI commands available:
- npm run db:init - Initialize database
- npm run db:status - Check database status
- npm run db:validate - Validate schema
- npm run db:test - Run database tests
- npm run db:diagnostics - Full diagnostics
2025-07-19 23:21:50 +02:00

407 lines
18 KiB
HTML

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Enhanced Duplicate Detection Test</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.test-section { margin: 20px 0; padding: 15px; border: 1px solid #ddd; }
.test-result { margin: 10px 0; padding: 10px; background: #f5f5f5; }
.pass { background: #d4edda; color: #155724; }
.fail { background: #f8d7da; color: #721c24; }
</style>
</head>
<body>
<h1>Enhanced Duplicate Detection Test</h1>
<div class="test-section">
<h2>Test Data</h2>
<p>Testing with sample bookmarks that should be detected as duplicates:</p>
<ul>
<li>Exact URL duplicates</li>
<li>URL variants (with/without query params)</li>
<li>Similar titles (fuzzy matching)</li>
</ul>
</div>
<div id="testResults"></div>
<script>
// Mock BookmarkManager class with just the duplicate detection methods
/**
 * Mock BookmarkManager that carries only the duplicate-detection methods,
 * pre-loaded with a small fixture of bookmarks:
 * ids 1-2 differ only by "www." / trailing slash, ids 3-4 differ only by a
 * query string, ids 5-6 have near-identical titles, ids 7-8 are unrelated.
 */
class TestBookmarkManager {
  constructor() {
    // Read the clock once so every fixture timestamp shares the same base.
    const now = Date.now();
    const DAY_MS = 86400000;
    this.bookmarks = [
      // Exact URL duplicates
      { id: 1, title: "Google", url: "https://www.google.com", addDate: now - DAY_MS },
      { id: 2, title: "Google Search", url: "https://google.com/", addDate: now },
      // URL variants
      { id: 3, title: "GitHub", url: "https://github.com", addDate: now - 2 * DAY_MS },
      { id: 4, title: "GitHub Home", url: "https://github.com?tab=repositories", addDate: now },
      // Similar titles
      { id: 5, title: "JavaScript Tutorial", url: "https://example1.com", addDate: now - 3 * DAY_MS },
      { id: 6, title: "Javascript Tutorials", url: "https://example2.com", addDate: now },
      // Different bookmarks (should not be duplicates)
      { id: 7, title: "Stack Overflow", url: "https://stackoverflow.com", addDate: now },
      { id: 8, title: "MDN Web Docs", url: "https://developer.mozilla.org", addDate: now }
    ];
  }

  /**
   * Builds a canonical string form of a URL so equivalent URLs compare equal.
   *
   * @param {string} url - URL to normalize.
   * @param {Object} [options]
   * @param {boolean} [options.removeQueryParams=false] - Drop the whole query string.
   * @param {boolean} [options.removeFragment=false] - Drop the "#..." fragment.
   * @param {boolean} [options.removeWWW=true] - Strip a leading "www." from the host.
   * @param {boolean} [options.removeTrailingSlash=true] - Strip a trailing "/" from
   *   any path other than the root path "/".
   * @param {boolean} [options.sortQueryParams=true] - Order query parameters by key.
   * @param {boolean} [options.removeDefaultPorts=true] - See the note on port handling below.
   * @param {boolean} [options.removeCommonTracking=false] - Delete well-known tracking parameters.
   * @returns {string} The normalized URL; if `url` cannot be parsed, the
   *   lower-cased, trimmed input is returned instead.
   */
  normalizeUrl(url, options = {}) {
    const {
      removeQueryParams = false,
      removeFragment = false,
      removeWWW = true,
      removeTrailingSlash = true,
      sortQueryParams = true,
      removeDefaultPorts = true,
      removeCommonTracking = false
    } = options;
    try {
      const urlObj = new URL(url);
      let normalized = urlObj.protocol.toLowerCase() + '//';
      let hostname = urlObj.hostname.toLowerCase();
      if (removeWWW && hostname.startsWith('www.')) {
        hostname = hostname.substring(4);
      }
      normalized += hostname;
      // NOTE: with removeDefaultPorts enabled, an explicit port is only kept
      // for http:/https: URLs (and only when it is not 80/443 respectively);
      // for any other scheme the port is omitted.
      if (removeDefaultPorts) {
        if ((urlObj.protocol === 'http:' && urlObj.port && urlObj.port !== '80') ||
            (urlObj.protocol === 'https:' && urlObj.port && urlObj.port !== '443')) {
          normalized += ':' + urlObj.port;
        }
      } else if (urlObj.port) {
        normalized += ':' + urlObj.port;
      }
      let pathname = urlObj.pathname;
      if (removeTrailingSlash && pathname !== '/' && pathname.endsWith('/')) {
        pathname = pathname.slice(0, -1);
      }
      normalized += pathname;
      if (!removeQueryParams && urlObj.search) {
        const params = new URLSearchParams(urlObj.search);
        if (removeCommonTracking) {
          const trackingParams = [
            'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
            'gclid', 'fbclid', 'msclkid', 'ref', 'source', 'campaign',
            '_ga', '_gid', 'mc_cid', 'mc_eid', 'yclid'
          ];
          trackingParams.forEach(param => params.delete(param));
        }
        if (params.toString()) {
          if (sortQueryParams) {
            const sortedParams = new URLSearchParams();
            // keys() yields a repeated key once per occurrence; de-duplicate
            // first so getAll() contributes each value exactly once
            // (otherwise "?a=1&a=2" would become "?a=1&a=2&a=1&a=2").
            [...new Set(params.keys())].sort().forEach(key => {
              params.getAll(key).forEach(value => {
                sortedParams.append(key, value);
              });
            });
            normalized += '?' + sortedParams.toString();
          } else {
            normalized += '?' + params.toString();
          }
        }
      }
      if (!removeFragment && urlObj.hash) {
        normalized += urlObj.hash;
      }
      return normalized;
    } catch (error) {
      // Unparseable input: fall back to a plain case/whitespace normalization.
      console.warn('URL normalization failed for:', url, error);
      return url.toLowerCase().trim();
    }
  }

  /**
   * Computes the Levenshtein edit distance (insert / delete / substitute,
   * each costing 1) between two strings using a full DP matrix.
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {number} Minimum number of single-character edits.
   */
  levenshteinDistance(str1, str2) {
    const matrix = [];
    // First column: distance from the empty prefix of str1 to each prefix of str2.
    for (let i = 0; i <= str2.length; i++) {
      matrix[i] = [i];
    }
    // First row: distance from the empty prefix of str2 to each prefix of str1.
    for (let j = 0; j <= str1.length; j++) {
      matrix[0][j] = j;
    }
    for (let i = 1; i <= str2.length; i++) {
      for (let j = 1; j <= str1.length; j++) {
        if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
          matrix[i][j] = matrix[i - 1][j - 1];
        } else {
          matrix[i][j] = Math.min(
            matrix[i - 1][j - 1] + 1, // substitution
            matrix[i][j - 1] + 1,     // insertion
            matrix[i - 1][j] + 1      // deletion
          );
        }
      }
    }
    return matrix[str2.length][str1.length];
  }

  /**
   * Case-insensitive similarity score derived from the edit distance.
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {number} 1 for identical strings, 0 when exactly one side is
   *   empty/falsy, otherwise (maxLength - distance) / maxLength.
   */
  calculateSimilarity(str1, str2) {
    if (str1 === str2) return 1;
    if (!str1 || !str2) return 0;
    const maxLength = Math.max(str1.length, str2.length);
    if (maxLength === 0) return 1;
    const distance = this.levenshteinDistance(str1.toLowerCase(), str2.toLowerCase());
    return (maxLength - distance) / maxLength;
  }

  /**
   * Lower-cases a title, removes every character that is neither a `\w`
   * word character nor whitespace, collapses whitespace runs to one space
   * and trims the ends.
   *
   * @param {string} title
   * @returns {string}
   */
  normalizeTitle(title) {
    return title
      .toLowerCase()
      .replace(/[^\w\s]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Groups bookmarks whose URLs are equal after normalization that keeps
   * query string and fragment (strategy 1).
   *
   * @returns {Array<Array<Object>>} Groups containing at least two bookmarks.
   */
  findUrlDuplicates() {
    const urlMap = new Map();
    this.bookmarks.forEach(bookmark => {
      const normalizedUrl = this.normalizeUrl(bookmark.url, {
        removeQueryParams: false,
        removeFragment: false,
        removeWWW: true,
        removeTrailingSlash: true,
        sortQueryParams: true,
        removeCommonTracking: false
      });
      if (urlMap.has(normalizedUrl)) {
        urlMap.get(normalizedUrl).push(bookmark);
      } else {
        urlMap.set(normalizedUrl, [bookmark]);
      }
    });
    return Array.from(urlMap.values()).filter(group => group.length > 1);
  }

  /**
   * Groups not-yet-processed bookmarks whose URLs are equal once query
   * string and fragment are stripped (strategy 2).
   *
   * @param {Set<number>} processedBookmarks - Ids to skip.
   * @returns {Array<Array<Object>>} Groups containing at least two bookmarks.
   */
  findUrlVariantDuplicates(processedBookmarks) {
    const baseUrlMap = new Map();
    this.bookmarks
      .filter(bookmark => !processedBookmarks.has(bookmark.id))
      .forEach(bookmark => {
        const baseUrl = this.normalizeUrl(bookmark.url, {
          removeQueryParams: true,
          removeFragment: true,
          removeWWW: true,
          removeTrailingSlash: true
        });
        if (baseUrlMap.has(baseUrl)) {
          baseUrlMap.get(baseUrl).push(bookmark);
        } else {
          baseUrlMap.set(baseUrl, [bookmark]);
        }
      });
    return Array.from(baseUrlMap.values()).filter(group => group.length > 1);
  }

  /**
   * Groups not-yet-processed bookmarks with similar titles (strategy 3).
   * Each bookmark seeds a group and collects every later bookmark whose
   * normalized title scores above 0.8 similarity and differs in length by
   * fewer than 20 characters.
   *
   * @param {Set<number>} processedBookmarks - Ids to skip.
   * @returns {Array<{bookmarks: Array<Object>, confidence: number}>} One
   *   entry per group of two or more; `confidence` is the mean similarity
   *   of the members to the seed title, rounded to two decimals.
   */
  findTitleDuplicates(processedBookmarks) {
    const titleGroups = [];
    const remainingBookmarks = this.bookmarks.filter(bookmark => !processedBookmarks.has(bookmark.id));
    // Ids already assigned to a title group, so no bookmark lands in two groups.
    const processedTitles = new Set();
    remainingBookmarks.forEach((bookmark, index) => {
      if (processedTitles.has(bookmark.id)) return;
      const normalizedTitle = this.normalizeTitle(bookmark.title);
      const similarBookmarks = [bookmark];
      for (let i = index + 1; i < remainingBookmarks.length; i++) {
        const otherBookmark = remainingBookmarks[i];
        if (processedTitles.has(otherBookmark.id)) continue;
        const otherNormalizedTitle = this.normalizeTitle(otherBookmark.title);
        const similarity = this.calculateSimilarity(normalizedTitle, otherNormalizedTitle);
        if (similarity > 0.8 && Math.abs(normalizedTitle.length - otherNormalizedTitle.length) < 20) {
          similarBookmarks.push(otherBookmark);
          processedTitles.add(otherBookmark.id);
        }
      }
      if (similarBookmarks.length > 1) {
        // Average over the matched members only; index 0 is the seed itself.
        const avgSimilarity = similarBookmarks.reduce((sum, member, idx) => {
          if (idx === 0) return sum;
          return sum + this.calculateSimilarity(normalizedTitle, this.normalizeTitle(member.title));
        }, 0) / (similarBookmarks.length - 1);
        titleGroups.push({
          bookmarks: similarBookmarks,
          confidence: Math.round(avgSimilarity * 100) / 100
        });
        similarBookmarks.forEach(member => processedTitles.add(member.id));
      }
    });
    return titleGroups;
  }

  /**
   * Runs the three strategies in order (exact URL, URL variant, fuzzy
   * title). Bookmarks matched by an earlier strategy are excluded from the
   * later ones.
   *
   * @returns {Promise<Array<{type: string, reason: string, bookmarks: Array<Object>, confidence: number}>>}
   */
  async detectDuplicates() {
    const duplicateGroups = [];
    const processedBookmarks = new Set();
    // Strategy 1: Exact URL matches
    const urlGroups = this.findUrlDuplicates();
    urlGroups.forEach(group => {
      if (group.length > 1) {
        duplicateGroups.push({
          type: 'exact_url',
          reason: 'Identical URLs',
          bookmarks: group,
          confidence: 1.0
        });
        group.forEach(bookmark => processedBookmarks.add(bookmark.id));
      }
    });
    // Strategy 2: URL variants
    const urlVariantGroups = this.findUrlVariantDuplicates(processedBookmarks);
    urlVariantGroups.forEach(group => {
      if (group.length > 1) {
        duplicateGroups.push({
          type: 'url_variant',
          reason: 'Same URL with different parameters/fragments',
          bookmarks: group,
          confidence: 0.9
        });
        group.forEach(bookmark => processedBookmarks.add(bookmark.id));
      }
    });
    // Strategy 3: Fuzzy title matching
    const titleGroups = this.findTitleDuplicates(processedBookmarks);
    titleGroups.forEach(group => {
      // Title groups are { bookmarks, confidence } objects, not arrays, so
      // the member count lives on group.bookmarks.
      if (group.bookmarks.length > 1) {
        duplicateGroups.push({
          type: 'fuzzy_title',
          reason: 'Similar titles',
          bookmarks: group.bookmarks,
          confidence: group.confidence
        });
        group.bookmarks.forEach(bookmark => processedBookmarks.add(bookmark.id));
      }
    });
    return duplicateGroups;
  }
}
// Run tests
/**
 * Runs the three checks (URL normalization, similarity scoring, duplicate
 * detection) against a fresh TestBookmarkManager and renders a pass/fail
 * summary plus the detected groups into the #testResults element.
 * Any exception thrown while testing is logged and rendered as a test error.
 *
 * @returns {Promise<void>}
 */
async function runTests() {
  const manager = new TestBookmarkManager();
  const resultsDiv = document.getElementById('testResults');

  // Values below are interpolated into innerHTML; escape them so a title,
  // URL or error message containing markup is shown as text.
  const escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  try {
    console.log('Starting enhanced duplicate detection tests...');

    // Test URL normalization: each pair must collapse to the same string.
    // The pairs differ by "www."/trailing slash, by a query string and by a
    // fragment, so query and fragment removal must be enabled as well;
    // both default to false in normalizeUrl.
    const variantOptions = {
      removeWWW: true,
      removeTrailingSlash: true,
      removeQueryParams: true,
      removeFragment: true
    };
    const testUrls = [
      ['https://www.google.com/', 'https://google.com'],
      ['https://github.com?tab=repositories', 'https://github.com'],
      ['https://example.com#section', 'https://example.com']
    ];
    let urlNormalizationPassed = true;
    testUrls.forEach(([url1, url2]) => {
      const normalized1 = manager.normalizeUrl(url1, variantOptions);
      const normalized2 = manager.normalizeUrl(url2, variantOptions);
      if (normalized1 !== normalized2) {
        urlNormalizationPassed = false;
        console.log(`URL normalization failed: ${url1} -> ${normalized1}, ${url2} -> ${normalized2}`);
      }
    });

    // Test similarity calculation
    const similarity1 = manager.calculateSimilarity('JavaScript Tutorial', 'Javascript Tutorials');
    const similarity2 = manager.calculateSimilarity('Completely Different', 'Another Thing');
    const similarityPassed = similarity1 > 0.8 && similarity2 < 0.5;

    // Test duplicate detection
    const duplicateGroups = await manager.detectDuplicates();
    const expectedGroups = 3; // Should find 3 groups: exact URLs, URL variants, similar titles
    let duplicateDetectionPassed = true;
    if (duplicateGroups.length !== expectedGroups) {
      duplicateDetectionPassed = false;
      console.log(`Expected ${expectedGroups} duplicate groups, found ${duplicateGroups.length}`);
    }

    // Display results
    resultsDiv.innerHTML = `
      <div class="test-section">
        <h2>Test Results</h2>
        <div class="test-result ${urlNormalizationPassed ? 'pass' : 'fail'}">
          URL Normalization: ${urlNormalizationPassed ? 'PASS' : 'FAIL'}
        </div>
        <div class="test-result ${similarityPassed ? 'pass' : 'fail'}">
          Similarity Calculation: ${similarityPassed ? 'PASS' : 'FAIL'}
          (JS Tutorial similarity: ${similarity1.toFixed(2)}, Different strings: ${similarity2.toFixed(2)})
        </div>
        <div class="test-result ${duplicateDetectionPassed ? 'pass' : 'fail'}">
          Duplicate Detection: ${duplicateDetectionPassed ? 'PASS' : 'FAIL'}
          (Found ${duplicateGroups.length} groups)
        </div>
      </div>
      <div class="test-section">
        <h2>Detected Duplicate Groups</h2>
        ${duplicateGroups.map((group, index) => `
          <div class="test-result">
            <strong>Group ${index + 1}: ${escapeHtml(group.reason)}</strong>
            (Type: ${escapeHtml(group.type)}, Confidence: ${escapeHtml(group.confidence)})
            <ul>
              ${group.bookmarks.map(bookmark =>
                `<li>${escapeHtml(bookmark.title)} - ${escapeHtml(bookmark.url)}</li>`
              ).join('')}
            </ul>
          </div>
        `).join('')}
      </div>
    `;
    console.log('Tests completed successfully');
  } catch (error) {
    console.error('Test failed:', error);
    resultsDiv.innerHTML = `
      <div class="test-result fail">
        <strong>Test Error:</strong> ${escapeHtml(error.message)}
      </div>
    `;
  }
}
// Kick off the test suite when the window's load event fires.
window.addEventListener('load', () => runTests());
</script>
</body>
</html>
</content>