import { pool } from '../db.js';
import { readFileSync, createReadStream } from 'fs';
import { createGunzip } from 'zlib';
import { pipeline } from 'stream/promises';
import { Writable } from 'stream';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const SQL_DIR = join(__dirname, '..', '..', 'sql');

/**
 * Table name mapping from SQL dumps to our schema
 */
const TABLE_MAPPING = {
  'public.regions': 'public.continents',
  'regions': 'public.continents',
  'public.subregions': 'public.subregions',
};

/** Escape regex metacharacters so a table name is matched literally. */
const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * TABLE_MAPPING compiled once into search/replace pairs, in declaration order,
 * so the patterns are not rebuilt for every statement.
 */
const TABLE_RENAMES = Object.entries(TABLE_MAPPING).map(([oldName, newName]) => ({
  pattern: new RegExp(`INSERT INTO ${escapeRegExp(oldName)}`, 'gi'),
  replacement: `INSERT INTO ${newName}`,
}));

/** Captures everything between the outer parentheses of a single-row VALUES clause. */
const VALUES_PATTERN = /VALUES\s*\((.+)\);?$/is;

/** Number of rows sent per bulk INSERT by executeGzippedSqlFile. */
const BULK_INSERT_BATCH_SIZE = 2000;

// Target column lists (migration schema) and, for each table, the dump value
// index that feeds each target column.

const SUBREGION_COLUMNS = [
  'id', 'name', 'continent_id', 'translations',
  'created_at', 'updated_at', 'flag', 'wiki_data_id',
];
// Dump: 0:id, 1:name, 2:translations, 3:region_id, 4:created_at, 5:updated_at, 6:flag, 7:wikiDataId
const SUBREGION_VALUE_ORDER = [0, 1, 3, 2, 4, 5, 6, 7];

const COUNTRY_COLUMNS = [
  'id', 'name', 'iso2', 'iso3', 'numeric_code', 'phonecode', 'capital',
  'currency', 'currency_name', 'currency_symbol', 'tld', 'native',
  'continent_id', 'subregion_id', 'nationality', 'latitude', 'longitude',
  'emoji', 'emoji_u', 'timezones', 'translations',
  'created_at', 'updated_at', 'flag', 'wiki_data_id',
];
// Dump: 0:id, 1:name, 2:iso3, 3:numeric_code, 4:iso2, 5:phonecode, 6:capital, 7:currency,
// 8:currency_name, 9:currency_symbol, 10:tld, 11:native, 12:population, 13:gdp, 14:region,
// 15:region_id, 16:subregion, 17:subregion_id, 18:nationality, 19:area_sq_km,
// 20:postal_code_format, 21:postal_code_regex, 22:timezones, 23:translations, 24:latitude,
// 25:longitude, 26:emoji, 27:emojiU, 28:created_at, 29:updated_at, 30:flag, 31:wikiDataId
const COUNTRY_VALUE_ORDER = [
  0, 1, 4, 2, 3, 5, 6, 7, 8, 9, 10, 11,
  15, 17, 18, 24, 25, 26, 27, 22, 23, 28, 29, 30, 31,
];

const STATE_COLUMNS = [
  'id', 'name', 'country_id', 'country_code', 'iso2', 'fips_code', 'type',
  'latitude', 'longitude', 'timezone', 'translations',
  'created_at', 'updated_at', 'flag', 'wiki_data_id',
];
// Dump: 0:id, 1:name, 2:country_id, 3:country_code, 4:fips_code, 5:iso2, 6:iso3166_2, 7:type,
// 8:level, 9:parent_id, 10:native, 11:latitude, 12:longitude, 13:timezone, 14:translations,
// 15:created_at, 16:updated_at, 17:flag, 18:wikiDataId, 19:population
const STATE_VALUE_ORDER = [0, 1, 2, 3, 5, 4, 7, 11, 12, 13, 14, 15, 16, 17, 18];

const CITY_COLUMNS = [
  'id', 'name', 'state_id', 'state_code', 'country_id', 'country_code',
  'latitude', 'longitude', 'population', 'timezone', 'translations',
  'created_at', 'updated_at', 'flag', 'wiki_data_id',
];
// Dump: 0:id, 1:name, 2:state_id, 3:state_code, 4:country_id, 5:country_code, 6:type, 7:level,
// 8:parent_id, 9:latitude, 10:longitude, 11:native, 12:population, 13:timezone,
// 14:translations, 15:created_at, 16:updated_at, 17:flag, 18:wikiDataId
const CITY_VALUE_ORDER = [0, 1, 2, 3, 4, 5, 9, 10, 12, 13, 14, 15, 16, 17, 18];

/**
 * Return the raw text inside the VALUES (...) clause of an INSERT statement.
 * @param {string} stmt - A single INSERT statement.
 * @returns {string|null} The text between the outer parentheses, or null if no VALUES clause matched.
 */
function extractValuesList(stmt) {
  const match = stmt.match(VALUES_PATTERN);
  return match ? match[1] : null;
}

/**
 * Split the VALUES clause of an INSERT statement into individual SQL value tokens.
 * @param {string} stmt - A single INSERT statement.
 * @returns {string[]|null} Parsed value tokens, or null if the statement has no VALUES clause.
 */
function parseStatementValues(stmt) {
  const valuesStr = extractValuesList(stmt);
  return valuesStr === null ? null : parseValues(valuesStr);
}

/**
 * Build an INSERT statement with an explicit column list.
 * @param {string} table - Target table name.
 * @param {string[]} columns - Target column names.
 * @param {string[]} values - SQL value tokens, one per column.
 * @returns {string} The assembled statement, terminated by a semicolon.
 */
function buildInsert(table, columns, values) {
  return `INSERT INTO ${table} (${columns.join(', ')}) VALUES (${values.join(', ')});`;
}

/**
 * True when the statement text contains an INSERT into the given table,
 * with or without the `public.` schema prefix.
 */
function isInsertInto(stmt, table) {
  return stmt.includes(`INSERT INTO ${table}`) || stmt.includes(`INSERT INTO public.${table}`);
}

/**
 * Transform subregions INSERT statements to match migration schema
 * SQL dump: id, name, translations, region_id, created_at, updated_at, flag, wikiDataId
 * Migration: id, name, continent_id, translations, created_at, updated_at, flag, wiki_data_id
 *
 * @param {string} stmt - A single INSERT statement from the dump.
 * @returns {string} The rewritten statement, or `stmt` unchanged when it has no
 *   VALUES clause or fewer than 8 values.
 */
function transformSubregionsInsert(stmt) {
  const values = parseStatementValues(stmt);
  if (!values || values.length < 8) return stmt;

  return buildInsert('subregions', SUBREGION_COLUMNS, SUBREGION_VALUE_ORDER.map((i) => values[i]));
}

/**
 * Transform countries INSERT statements to match migration schema
 * (32 dump columns -> 25 migration columns; region_id becomes continent_id).
 * See COUNTRY_COLUMNS / COUNTRY_VALUE_ORDER for the exact mapping.
 *
 * @param {string} stmt - A single INSERT statement from the dump.
 * @returns {string} The rewritten statement, or `stmt` unchanged when it has no
 *   VALUES clause or fewer than 32 values.
 */
function transformCountriesInsert(stmt) {
  const values = parseStatementValues(stmt);
  if (!values || values.length < 32) return stmt;

  return buildInsert('countries', COUNTRY_COLUMNS, COUNTRY_VALUE_ORDER.map((i) => values[i]));
}

/**
 * Transform states INSERT statements to match migration schema
 * (20 dump columns -> 15 migration columns). The VALUES clause is parsed and the
 * needed values are re-emitted under an explicit column list; see
 * STATE_COLUMNS / STATE_VALUE_ORDER for the exact mapping.
 *
 * @param {string} stmt - A single INSERT statement from the dump.
 * @returns {string} The rewritten statement, or `stmt` unchanged (with a logged
 *   warning on a count mismatch) when it cannot be mapped.
 */
function transformStatesInsert(stmt) {
  const values = parseStatementValues(stmt);
  if (!values) return stmt;

  if (values.length !== 20) {
    console.log(` ⚠️ Expected 20 columns for states, got ${values.length}`);
    return stmt;
  }

  return buildInsert('states', STATE_COLUMNS, STATE_VALUE_ORDER.map((i) => values[i]));
}

/**
 * Transform cities INSERT statements to match migration schema
 * (19 dump columns -> 15 migration columns). See CITY_COLUMNS / CITY_VALUE_ORDER
 * for the exact mapping.
 *
 * @param {string} stmt - A single INSERT statement from the dump.
 * @returns {string} The rewritten statement, or `stmt` unchanged when it has no
 *   VALUES clause or does not contain exactly 19 values.
 */
function transformCitiesInsert(stmt) {
  const values = parseStatementValues(stmt);
  if (!values || values.length !== 19) return stmt;

  return buildInsert('cities', CITY_COLUMNS, CITY_VALUE_ORDER.map((i) => values[i]));
}

/**
 * Helper to parse VALUES string with proper handling of nested JSON and quoted strings
 * This handles SQL values like: 1, 'name', '{"br": "value"}', NULL
 *
 * Values are split on commas that are outside single-quoted strings and outside
 * any (), {} or [] nesting. Each token is returned trimmed, quotes included.
 *
 * @param {string} valuesStr - Text between the outer parentheses of a VALUES clause.
 * @returns {string[]} The individual value tokens, in order.
 */
function parseValues(valuesStr) {
  const values = [];
  let current = '';
  let depth = 0;
  let inString = false;

  for (let i = 0; i < valuesStr.length; i++) {
    const char = valuesStr[i];

    if (!inString && char === "'") {
      inString = true;
    } else if (inString && char === "'" && valuesStr[i - 1] !== '\\') {
      if (valuesStr[i + 1] === "'") {
        // Doubled quote ('') is an escaped quote: keep both characters and
        // stay inside the string.
        current += char;
        i++;
      } else {
        inString = false;
      }
    }

    // Nesting depth and separators only count outside strings.
    if (!inString) {
      if (char === '(' || char === '{' || char === '[') depth++;
      if (char === ')' || char === '}' || char === ']') depth--;
      if (char === ',' && depth === 0) {
        values.push(current.trim());
        current = '';
        continue;
      }
    }

    current += char;
  }

  if (current.trim()) values.push(current.trim());
  return values;
}

/**
 * Remove or comment out dump commands that are not INSERTs (psql meta-commands,
 * setval/set_config calls, ownership changes, comments, SET statements).
 * These patterns are meant to match standalone commands only.
 *
 * @param {string} sql - Raw dump text.
 * @returns {string} The cleaned dump text.
 */
function cleanDumpSql(sql) {
  return sql
    .replace(/\\restrict[^\n]*/g, '')
    .replace(/\\unrestrict[^\n]*/g, '')
    .replace(/^SELECT pg_catalog\.setval[^;]*;/gm, '')
    .replace(/^ALTER TABLE[^;]*OWNER TO[^;]*;/gm, '')
    .replace(/^COMMENT ON[^;]*;/gm, '')
    .replace(/^SET [a-z_]+\s*=/gmi, (match) => `-- ${match}`) // Comment out SET statements
    .replace(/^SET [a-z_]+;$/gmi, (match) => `-- ${match}`) // Comment out simple SET statements
    .replace(/^SELECT[^;]*set_config[^;]*;/gm, '');
}

/**
 * Clean a dump and return its INSERT statements.
 *
 * NOTE(review): statements are terminated at the first ';', so a semicolon
 * inside a quoted value would truncate that statement. Confirm the dumps never
 * contain one before relying on this for new data sets.
 *
 * @param {string} sql - Raw dump text.
 * @returns {string[]} Every `INSERT INTO ...;` statement found (possibly empty).
 */
function extractInsertStatements(sql) {
  return cleanDumpSql(sql).match(/INSERT INTO[^;]+;/g) ?? [];
}

/**
 * Convert one dump INSERT into the statement that is actually executed:
 * quoting/column-name fixes, table renames, per-table column remapping, and a
 * trailing ON CONFLICT DO NOTHING so re-runs do not raise duplicate key errors.
 *
 * @param {string} stmt - A single INSERT statement from the dump.
 * @returns {string} The statement to execute.
 */
function prepareInsert(stmt) {
  // Backticks become double quotes; the quoted camelCase columns become snake_case.
  let pgStmt = stmt
    .replace(/`/g, '"')
    .replace(/"emojiU"/g, 'emoji_u')
    .replace(/"wikiDataId"/g, 'wiki_data_id');

  for (const { pattern, replacement } of TABLE_RENAMES) {
    pgStmt = pgStmt.replace(pattern, replacement);
  }

  // Column remapping. Both the bare and the `public.`-qualified table names are
  // recognised for every table, subregions included.
  if (isInsertInto(pgStmt, 'subregions')) {
    pgStmt = transformSubregionsInsert(pgStmt);
  }
  if (isInsertInto(pgStmt, 'countries')) {
    pgStmt = transformCountriesInsert(pgStmt);
  }
  if (isInsertInto(pgStmt, 'states')) {
    pgStmt = transformStatesInsert(pgStmt);
  }

  // prevent duplicate key errors
  const trimmed = pgStmt.trim();
  if (trimmed.endsWith(';')) {
    return `${trimmed.slice(0, -1)} ON CONFLICT DO NOTHING;`;
  }
  return `${pgStmt} ON CONFLICT DO NOTHING;`;
}

/**
 * Execute a SQL file directly, one INSERT statement per query.
 *
 * @param {string} filename - File name inside SQL_DIR.
 * @param {string} tableName - Table name used in log output.
 * @returns {Promise<number>} Number of INSERT statements executed (0 if none found).
 * @throws Re-throws any read or query error after logging it.
 */
async function executeSqlFile(filename, tableName) {
  const filePath = join(SQL_DIR, filename);
  console.log(` 📄 Loading ${filename}...`);

  try {
    const insertStatements = extractInsertStatements(readFileSync(filePath, 'utf8'));

    if (insertStatements.length === 0) {
      console.log(` ⚠️ No INSERT statements found in ${filename}`);
      return 0;
    }

    // Statements are executed strictly one after another.
    for (const stmt of insertStatements) {
      await pool.query(prepareInsert(stmt));
    }

    console.log(` ✓ ${insertStatements.length} records inserted into ${tableName}`);
    return insertStatements.length;
  } catch (error) {
    console.error(` ❌ Error loading ${filename}:`, error.message);
    throw error;
  }
}

/**
 * Read a gzip-compressed text file and return its contents as a string.
 *
 * The decompressed chunks are collected as Buffers and decoded once at the end:
 * decoding chunk-by-chunk would corrupt any multi-byte UTF-8 character that
 * happens to straddle a chunk boundary.
 *
 * @param {string} filePath - Path of the .gz file.
 * @returns {Promise<string>} The decompressed contents decoded as UTF-8.
 */
async function readGzippedText(filePath) {
  const chunks = [];
  await pipeline(
    createReadStream(filePath),
    createGunzip(),
    new Writable({
      write(chunk, _encoding, callback) {
        chunks.push(chunk);
        callback();
      },
    }),
  );
  return Buffer.concat(chunks).toString('utf8');
}

/**
 * Execute a gzipped SQL file using optimized bulk inserts: the single-row
 * INSERTs of the dump are regrouped into multi-row INSERTs of
 * BULK_INSERT_BATCH_SIZE rows each, with ON CONFLICT DO NOTHING.
 *
 * For `cities` the values are remapped to the migration columns; for any other
 * table the dump values are inserted as-is without a column list.
 *
 * @param {string} filename - File name inside SQL_DIR.
 * @param {string} tableName - Target table of the bulk INSERTs.
 * @returns {Promise<number>} Number of rows sent to the database (0 if none found).
 * @throws Re-throws any read or query error after logging it.
 */
async function executeGzippedSqlFile(filename, tableName) {
  const filePath = join(SQL_DIR, filename);
  console.log(` 📄 Loading ${filename} (gzipped)...`);

  try {
    const insertStatements = extractInsertStatements(await readGzippedText(filePath));

    if (insertStatements.length === 0) {
      console.log(` ⚠️ No INSERT statements found in ${filename}`);
      return 0;
    }

    console.log(` 📊 Found ${insertStatements.length} records to process...`);
    console.log(` 🚀 Optimizing: Grouping into bulk INSERTs...`);

    const isCities = tableName === 'cities';
    const columns = isCities ? `(${CITY_COLUMNS.join(', ')})` : '';
    // For cities, reuse the single-statement transform and keep only its VALUES part.
    const toRowValues = isCities
      ? (stmt) => extractValuesList(transformCitiesInsert(stmt))
      : extractValuesList;

    const rows = [];
    for (const stmt of insertStatements) {
      const values = toRowValues(stmt);
      if (values) rows.push(`(${values})`);
    }

    const batchCount = Math.ceil(rows.length / BULK_INSERT_BATCH_SIZE);
    console.log(` ⚡ Executing ${batchCount} bulk queries...`);

    for (let i = 0; i < batchCount; i++) {
      const batch = rows.slice(i * BULK_INSERT_BATCH_SIZE, (i + 1) * BULK_INSERT_BATCH_SIZE);
      const query = `INSERT INTO ${tableName} ${columns} VALUES ${batch.join(', ')} ON CONFLICT DO NOTHING`;
      await pool.query(query);

      // Progress line: refreshed every 10 batches and after the last one.
      if ((i + 1) % 10 === 0 || i === batchCount - 1) {
        const done = Math.min((i + 1) * BULK_INSERT_BATCH_SIZE, rows.length);
        process.stdout.write(`\r ... ${done} / ${rows.length} rows`);
      }
    }

    console.log('');
    console.log(` ✓ ${rows.length} records inserted into ${tableName}`);
    return rows.length;
  } catch (error) {
    console.error(` ❌ Error loading ${filename}:`, error.message);
    throw error;
  }
}

/**
 * Query and print the row counts of all five location tables.
 * @returns {Promise<void>}
 */
async function logLocationCounts() {
  const counts = await pool.query(`
    SELECT
      (SELECT COUNT(*) FROM continents) as continents,
      (SELECT COUNT(*) FROM subregions) as subregions,
      (SELECT COUNT(*) FROM countries) as countries,
      (SELECT COUNT(*) FROM states) as states,
      (SELECT COUNT(*) FROM cities) as cities
  `);
  const c = counts.rows[0];
  console.log(` 📊 Totals: ${c.continents} continents, ${c.subregions} subregions, ${c.countries} countries, ${c.states} states, ${c.cities} cities`);
}

/**
 * Seed base location data (Continents, Subregions, Countries)
 * This is fast and required for Company seeding
 *
 * @returns {Promise<void>}
 * @throws Re-throws any loader error after logging it.
 */
export async function seedBaseLocations() {
  console.log('🌍 Seeding base location data (Continents -> Countries)...');

  try {
    // 1. Continents (from regions.sql - 6 records)
    console.log('1️⃣ Seeding Continents...');
    console.time(' ⏱️ Continents');
    await executeSqlFile('regions.sql', 'continents');
    console.timeEnd(' ⏱️ Continents');

    // 2. Subregions (22 records)
    console.log('2️⃣ Seeding Subregions...');
    console.time(' ⏱️ Subregions');
    await executeSqlFile('subregions.sql', 'subregions');
    console.timeEnd(' ⏱️ Subregions');

    // 3. Countries (~250 records)
    console.log('3️⃣ Seeding Countries...');
    console.time(' ⏱️ Countries');
    await executeSqlFile('countries.sql', 'countries');
    console.timeEnd(' ⏱️ Countries');
  } catch (error) {
    console.error('❌ Base location seeding failed:', error.message);
    throw error;
  }
}

/**
 * Seed detailed location data (States, Cities)
 * This is slower/heavy and should be run after Users/Companies are ready
 *
 * @returns {Promise<void>}
 * @throws Re-throws any loader error after logging it.
 */
export async function seedDetailedLocations() {
  console.log('🌍 Seeding detailed location data (States -> Cities)...');

  try {
    // 4. States (~5400 records)
    console.log('4️⃣ Seeding States...');
    console.time(' ⏱️ States');
    await executeSqlFile('states.sql', 'states');
    console.timeEnd(' ⏱️ States');

    // 5. Cities (~160k records) - This is the big one
    console.log('5️⃣ Seeding Cities (this may take a while)...');
    console.time(' ⏱️ Cities (Bulk Insert)');
    await executeGzippedSqlFile('cities.sql.gz', 'cities');
    console.timeEnd(' ⏱️ Cities (Bulk Insert)');
  } catch (error) {
    console.error('❌ Detailed location seeding failed:', error.message);
    throw error;
  }
}

/**
 * Seed all location data from SQL dumps (Legacy wrapper)
 * Runs seedBaseLocations, then seedDetailedLocations, then prints table counts.
 *
 * @returns {Promise<void>}
 * @throws Re-throws any seeding or query error after logging it.
 */
export async function seedLocationData() {
  console.log('🌍 Seeding comprehensive location data...');
  console.log(' Source: GeoDB Cities (https://github.com/dr5hn/countries-states-cities-database)\n');

  try {
    await seedBaseLocations();
    await seedDetailedLocations();

    console.log('\n ✅ Location data seeding complete!');
    await logLocationCounts();
  } catch (error) {
    console.error('❌ Location seeding failed:', error.message);
    throw error;
  }
}

/**
 * Seed location data WITHOUT cities (fast mode for development)
 * Skips the cities import for faster database reset
 *
 * @returns {Promise<void>}
 * @throws Re-throws any loader or query error after logging it.
 */
export async function seedLocationDataLite() {
  console.log('🌍 Seeding location data (LITE - no cities)...');
  console.log(' Source: GeoDB Cities (https://github.com/dr5hn/countries-states-cities-database)');
  console.log(' ⚡ Skipping cities for faster seeding\n');

  try {
    // 1. Continents (from regions.sql - 6 records)
    console.log('1️⃣ Seeding Continents...');
    await executeSqlFile('regions.sql', 'continents');

    // 2. Subregions (22 records)
    console.log('2️⃣ Seeding Subregions...');
    await executeSqlFile('subregions.sql', 'subregions');

    // 3. Countries (~250 records)
    console.log('3️⃣ Seeding Countries...');
    await executeSqlFile('countries.sql', 'countries');

    // 4. States (~5400 records)
    console.log('4️⃣ Seeding States...');
    await executeSqlFile('states.sql', 'states');

    // 5. Skip cities
    console.log('5️⃣ ⏭️ Skipping Cities (use full seed for cities)\n');

    console.log(' ✅ Location data LITE seeding complete!');
    // The cities count is still reported; this function itself inserts no cities.
    await logLocationCounts();
  } catch (error) {
    console.error('❌ Location seeding failed:', error.message);
    throw error;
  }
}

// For direct execution
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  import('../db.js')
    .then(async ({ testConnection, closePool }) => {
      const connected = await testConnection();
      if (!connected) {
        console.error('Could not connect to database');
        process.exit(1);
      }

      // Always release the pool, even when seeding fails part-way.
      try {
        await seedLocationData();
      } finally {
        await closePool();
      }
    })
    .catch((error) => {
      console.error('❌ Seeding aborted:', error.message);
      process.exitCode = 1;
    });
}