- Implement PostgreSQL database schema with users and bookmarks tables - Add database connection pooling with retry logic and error handling - Create migration system with automatic schema initialization - Add database CLI tools for management (init, status, validate, etc.) - Include comprehensive error handling and diagnostics - Add development seed data and testing utilities - Implement health monitoring and connection pool statistics - Create detailed documentation and troubleshooting guide Database features: - Users table with authentication fields and email verification - Bookmarks table with user association and metadata - Proper indexes for performance optimization - Automatic timestamp triggers - Transaction support with rollback handling - Connection pooling (20 max connections, 30s idle timeout) - Graceful shutdown handling CLI commands available: - npm run db:init - Initialize database - npm run db:status - Check database status - npm run db:validate - Validate schema - npm run db:test - Run database tests - npm run db:diagnostics - Full diagnostics
407 lines
18 KiB
HTML
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Enhanced Duplicate Detection Test</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .test-section { margin: 20px 0; padding: 15px; border: 1px solid #ddd; }
        .test-result { margin: 10px 0; padding: 10px; background: #f5f5f5; }
        .pass { background: #d4edda; color: #155724; }
        .fail { background: #f8d7da; color: #721c24; }
    </style>
</head>
<body>
    <h1>Enhanced Duplicate Detection Test</h1>

    <div class="test-section">
        <h2>Test Data</h2>
        <p>Testing with sample bookmarks that should be detected as duplicates:</p>
        <ul>
            <li>Exact URL duplicates</li>
            <li>URL variants (with/without query params)</li>
            <li>Similar titles (fuzzy matching)</li>
        </ul>
    </div>

    <div id="testResults"></div>

    <script>
// Mock BookmarkManager class with just the duplicate detection methods
|
|
class TestBookmarkManager {
|
|
constructor() {
|
|
this.bookmarks = [
|
|
// Exact URL duplicates
|
|
{ id: 1, title: "Google", url: "https://www.google.com", addDate: Date.now() - 86400000 },
|
|
{ id: 2, title: "Google Search", url: "https://google.com/", addDate: Date.now() },
|
|
|
|
// URL variants
|
|
{ id: 3, title: "GitHub", url: "https://github.com", addDate: Date.now() - 172800000 },
|
|
{ id: 4, title: "GitHub Home", url: "https://github.com?tab=repositories", addDate: Date.now() },
|
|
|
|
// Similar titles
|
|
{ id: 5, title: "JavaScript Tutorial", url: "https://example1.com", addDate: Date.now() - 259200000 },
|
|
{ id: 6, title: "Javascript Tutorials", url: "https://example2.com", addDate: Date.now() },
|
|
|
|
// Different bookmarks (should not be duplicates)
|
|
{ id: 7, title: "Stack Overflow", url: "https://stackoverflow.com", addDate: Date.now() },
|
|
{ id: 8, title: "MDN Web Docs", url: "https://developer.mozilla.org", addDate: Date.now() }
|
|
];
|
|
}
|
|
|
|
// Copy the enhanced methods from the main implementation
|
|
normalizeUrl(url, options = {}) {
|
|
const {
|
|
removeQueryParams = false,
|
|
removeFragment = false,
|
|
removeWWW = true,
|
|
removeTrailingSlash = true,
|
|
sortQueryParams = true,
|
|
removeDefaultPorts = true,
|
|
removeCommonTracking = false
|
|
} = options;
|
|
|
|
try {
|
|
const urlObj = new URL(url);
|
|
|
|
let normalized = urlObj.protocol.toLowerCase() + '//';
|
|
|
|
let hostname = urlObj.hostname.toLowerCase();
|
|
if (removeWWW && hostname.startsWith('www.')) {
|
|
hostname = hostname.substring(4);
|
|
}
|
|
normalized += hostname;
|
|
|
|
if (removeDefaultPorts) {
|
|
if ((urlObj.protocol === 'http:' && urlObj.port && urlObj.port !== '80') ||
|
|
(urlObj.protocol === 'https:' && urlObj.port && urlObj.port !== '443')) {
|
|
normalized += ':' + urlObj.port;
|
|
}
|
|
} else if (urlObj.port) {
|
|
normalized += ':' + urlObj.port;
|
|
}
|
|
|
|
let pathname = urlObj.pathname;
|
|
if (removeTrailingSlash && pathname !== '/' && pathname.endsWith('/')) {
|
|
pathname = pathname.slice(0, -1);
|
|
}
|
|
normalized += pathname;
|
|
|
|
if (!removeQueryParams && urlObj.search) {
|
|
const params = new URLSearchParams(urlObj.search);
|
|
|
|
if (removeCommonTracking) {
|
|
const trackingParams = [
|
|
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
|
|
'gclid', 'fbclid', 'msclkid', 'ref', 'source', 'campaign',
|
|
'_ga', '_gid', 'mc_cid', 'mc_eid', 'yclid'
|
|
];
|
|
trackingParams.forEach(param => params.delete(param));
|
|
}
|
|
|
|
if (params.toString()) {
|
|
if (sortQueryParams) {
|
|
const sortedParams = new URLSearchParams();
|
|
[...params.keys()].sort().forEach(key => {
|
|
params.getAll(key).forEach(value => {
|
|
sortedParams.append(key, value);
|
|
});
|
|
});
|
|
normalized += '?' + sortedParams.toString();
|
|
} else {
|
|
normalized += '?' + params.toString();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!removeFragment && urlObj.hash) {
|
|
normalized += urlObj.hash;
|
|
}
|
|
|
|
return normalized;
|
|
} catch (error) {
|
|
console.warn('URL normalization failed for:', url, error);
|
|
return url.toLowerCase().trim();
|
|
}
|
|
}
|
|
|
|
levenshteinDistance(str1, str2) {
|
|
const matrix = [];
|
|
|
|
for (let i = 0; i <= str2.length; i++) {
|
|
matrix[i] = [i];
|
|
}
|
|
|
|
for (let j = 0; j <= str1.length; j++) {
|
|
matrix[0][j] = j;
|
|
}
|
|
|
|
for (let i = 1; i <= str2.length; i++) {
|
|
for (let j = 1; j <= str1.length; j++) {
|
|
if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
|
|
matrix[i][j] = matrix[i - 1][j - 1];
|
|
} else {
|
|
matrix[i][j] = Math.min(
|
|
matrix[i - 1][j - 1] + 1,
|
|
matrix[i][j - 1] + 1,
|
|
matrix[i - 1][j] + 1
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
return matrix[str2.length][str1.length];
|
|
}
|
|
|
|
calculateSimilarity(str1, str2) {
|
|
if (str1 === str2) return 1;
|
|
if (!str1 || !str2) return 0;
|
|
|
|
const maxLength = Math.max(str1.length, str2.length);
|
|
if (maxLength === 0) return 1;
|
|
|
|
const distance = this.levenshteinDistance(str1.toLowerCase(), str2.toLowerCase());
|
|
return (maxLength - distance) / maxLength;
|
|
}
|
|
|
|
normalizeTitle(title) {
|
|
return title
|
|
.toLowerCase()
|
|
.replace(/[^\w\s]/g, '')
|
|
.replace(/\s+/g, ' ')
|
|
.trim();
|
|
}
|
|
|
|
findUrlDuplicates() {
|
|
const urlMap = new Map();
|
|
|
|
this.bookmarks.forEach(bookmark => {
|
|
const normalizedUrl = this.normalizeUrl(bookmark.url, {
|
|
removeQueryParams: false,
|
|
removeFragment: false,
|
|
removeWWW: true,
|
|
removeTrailingSlash: true,
|
|
sortQueryParams: true,
|
|
removeCommonTracking: false
|
|
});
|
|
|
|
if (urlMap.has(normalizedUrl)) {
|
|
urlMap.get(normalizedUrl).push(bookmark);
|
|
} else {
|
|
urlMap.set(normalizedUrl, [bookmark]);
|
|
}
|
|
});
|
|
|
|
return Array.from(urlMap.values()).filter(group => group.length > 1);
|
|
}
|
|
|
|
findUrlVariantDuplicates(processedBookmarks) {
|
|
const baseUrlMap = new Map();
|
|
|
|
this.bookmarks
|
|
.filter(bookmark => !processedBookmarks.has(bookmark.id))
|
|
.forEach(bookmark => {
|
|
const baseUrl = this.normalizeUrl(bookmark.url, {
|
|
removeQueryParams: true,
|
|
removeFragment: true,
|
|
removeWWW: true,
|
|
removeTrailingSlash: true
|
|
});
|
|
|
|
if (baseUrlMap.has(baseUrl)) {
|
|
baseUrlMap.get(baseUrl).push(bookmark);
|
|
} else {
|
|
baseUrlMap.set(baseUrl, [bookmark]);
|
|
}
|
|
});
|
|
|
|
return Array.from(baseUrlMap.values()).filter(group => group.length > 1);
|
|
}
|
|
|
|
findTitleDuplicates(processedBookmarks) {
|
|
const titleGroups = [];
|
|
const remainingBookmarks = this.bookmarks.filter(bookmark => !processedBookmarks.has(bookmark.id));
|
|
const processedTitles = new Set();
|
|
|
|
remainingBookmarks.forEach((bookmark, index) => {
|
|
if (processedTitles.has(bookmark.id)) return;
|
|
|
|
const normalizedTitle = this.normalizeTitle(bookmark.title);
|
|
const similarBookmarks = [bookmark];
|
|
|
|
for (let i = index + 1; i < remainingBookmarks.length; i++) {
|
|
const otherBookmark = remainingBookmarks[i];
|
|
if (processedTitles.has(otherBookmark.id)) continue;
|
|
|
|
const otherNormalizedTitle = this.normalizeTitle(otherBookmark.title);
|
|
const similarity = this.calculateSimilarity(normalizedTitle, otherNormalizedTitle);
|
|
|
|
if (similarity > 0.8 && Math.abs(normalizedTitle.length - otherNormalizedTitle.length) < 20) {
|
|
similarBookmarks.push(otherBookmark);
|
|
processedTitles.add(otherBookmark.id);
|
|
}
|
|
}
|
|
|
|
if (similarBookmarks.length > 1) {
|
|
const avgSimilarity = similarBookmarks.reduce((sum, bookmark, idx) => {
|
|
if (idx === 0) return sum;
|
|
return sum + this.calculateSimilarity(normalizedTitle, this.normalizeTitle(bookmark.title));
|
|
}, 0) / (similarBookmarks.length - 1);
|
|
|
|
titleGroups.push({
|
|
bookmarks: similarBookmarks,
|
|
confidence: Math.round(avgSimilarity * 100) / 100
|
|
});
|
|
|
|
similarBookmarks.forEach(bookmark => processedTitles.add(bookmark.id));
|
|
}
|
|
});
|
|
|
|
return titleGroups;
|
|
}
|
|
|
|
async detectDuplicates() {
|
|
const duplicateGroups = [];
|
|
const processedBookmarks = new Set();
|
|
|
|
// Strategy 1: Exact URL matches
|
|
const urlGroups = this.findUrlDuplicates();
|
|
urlGroups.forEach(group => {
|
|
if (group.length > 1) {
|
|
duplicateGroups.push({
|
|
type: 'exact_url',
|
|
reason: 'Identical URLs',
|
|
bookmarks: group,
|
|
confidence: 1.0
|
|
});
|
|
group.forEach(bookmark => processedBookmarks.add(bookmark.id));
|
|
}
|
|
});
|
|
|
|
// Strategy 2: URL variants
|
|
const urlVariantGroups = this.findUrlVariantDuplicates(processedBookmarks);
|
|
urlVariantGroups.forEach(group => {
|
|
if (group.length > 1) {
|
|
duplicateGroups.push({
|
|
type: 'url_variant',
|
|
reason: 'Same URL with different parameters/fragments',
|
|
bookmarks: group,
|
|
confidence: 0.9
|
|
});
|
|
group.forEach(bookmark => processedBookmarks.add(bookmark.id));
|
|
}
|
|
});
|
|
|
|
// Strategy 3: Fuzzy title matching
|
|
const titleGroups = this.findTitleDuplicates(processedBookmarks);
|
|
titleGroups.forEach(group => {
|
|
if (group.length > 1) {
|
|
duplicateGroups.push({
|
|
type: 'fuzzy_title',
|
|
reason: 'Similar titles',
|
|
bookmarks: group.bookmarks,
|
|
confidence: group.confidence
|
|
});
|
|
group.bookmarks.forEach(bookmark => processedBookmarks.add(bookmark.id));
|
|
}
|
|
});
|
|
|
|
return duplicateGroups;
|
|
}
|
|
}
|
|
|
|
// Run tests
|
|
async function runTests() {
|
|
const manager = new TestBookmarkManager();
|
|
const resultsDiv = document.getElementById('testResults');
|
|
|
|
try {
|
|
console.log('Starting enhanced duplicate detection tests...');
|
|
|
|
// Test URL normalization
|
|
const testUrls = [
|
|
['https://www.google.com/', 'https://google.com'],
|
|
['https://github.com?tab=repositories', 'https://github.com'],
|
|
['https://example.com#section', 'https://example.com']
|
|
];
|
|
|
|
let urlNormalizationPassed = true;
|
|
testUrls.forEach(([url1, url2]) => {
|
|
const normalized1 = manager.normalizeUrl(url1, { removeWWW: true, removeTrailingSlash: true });
|
|
const normalized2 = manager.normalizeUrl(url2, { removeWWW: true, removeTrailingSlash: true });
|
|
|
|
if (normalized1 !== normalized2) {
|
|
urlNormalizationPassed = false;
|
|
console.log(`URL normalization failed: ${url1} -> ${normalized1}, ${url2} -> ${normalized2}`);
|
|
}
|
|
});
|
|
|
|
// Test similarity calculation
|
|
const similarity1 = manager.calculateSimilarity('JavaScript Tutorial', 'Javascript Tutorials');
|
|
const similarity2 = manager.calculateSimilarity('Completely Different', 'Another Thing');
|
|
|
|
const similarityPassed = similarity1 > 0.8 && similarity2 < 0.5;
|
|
|
|
// Test duplicate detection
|
|
const duplicateGroups = await manager.detectDuplicates();
|
|
|
|
let duplicateDetectionPassed = true;
|
|
let expectedGroups = 3; // Should find 3 groups: exact URLs, URL variants, similar titles
|
|
|
|
if (duplicateGroups.length !== expectedGroups) {
|
|
duplicateDetectionPassed = false;
|
|
console.log(`Expected ${expectedGroups} duplicate groups, found ${duplicateGroups.length}`);
|
|
}
|
|
|
|
// Display results
|
|
resultsDiv.innerHTML = `
|
|
<div class="test-section">
|
|
<h2>Test Results</h2>
|
|
<div class="test-result ${urlNormalizationPassed ? 'pass' : 'fail'}">
|
|
URL Normalization: ${urlNormalizationPassed ? 'PASS' : 'FAIL'}
|
|
</div>
|
|
<div class="test-result ${similarityPassed ? 'pass' : 'fail'}">
|
|
Similarity Calculation: ${similarityPassed ? 'PASS' : 'FAIL'}
|
|
(JS Tutorial similarity: ${similarity1.toFixed(2)}, Different strings: ${similarity2.toFixed(2)})
|
|
</div>
|
|
<div class="test-result ${duplicateDetectionPassed ? 'pass' : 'fail'}">
|
|
Duplicate Detection: ${duplicateDetectionPassed ? 'PASS' : 'FAIL'}
|
|
(Found ${duplicateGroups.length} groups)
|
|
</div>
|
|
</div>
|
|
|
|
<div class="test-section">
|
|
<h2>Detected Duplicate Groups</h2>
|
|
${duplicateGroups.map((group, index) => `
|
|
<div class="test-result">
|
|
<strong>Group ${index + 1}: ${group.reason}</strong>
|
|
(Type: ${group.type}, Confidence: ${group.confidence})
|
|
<ul>
|
|
${group.bookmarks.map(bookmark =>
|
|
`<li>${bookmark.title} - ${bookmark.url}</li>`
|
|
).join('')}
|
|
</ul>
|
|
</div>
|
|
`).join('')}
|
|
</div>
|
|
`;
|
|
|
|
console.log('Tests completed successfully');
|
|
|
|
} catch (error) {
|
|
console.error('Test failed:', error);
|
|
resultsDiv.innerHTML = `
|
|
<div class="test-result fail">
|
|
<strong>Test Error:</strong> ${error.message}
|
|
</div>
|
|
`;
|
|
}
|
|
}
|
|
|
|
// Run tests when page loads
|
|
window.addEventListener('load', runTests);
    </script>
</body>
</html>