gohorsejobs/seeder-api/src/seeders/location-loader.js
Yamamoto e59e15dd35 fix(seeder): use fully qualified table names for location seeding
docs: update DATABASE.md and DEVOPS.md for local environment setup
2026-01-03 16:55:45 -03:00

617 lines
24 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { pool } from '../db.js';
import { readFileSync, createReadStream } from 'fs';
import { createGunzip } from 'zlib';
import { pipeline } from 'stream/promises';
import { Writable } from 'stream';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
// ESM has no __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// SQL dump files live in <package root>/sql (two levels up from src/seeders/).
const SQL_DIR = join(__dirname, '..', '..', 'sql');
/**
 * Table name mapping from SQL dumps to our schema.
 *
 * Keys are table names as they appear in the dump's INSERT statements
 * (both schema-qualified and bare forms); values are the target tables.
 * NOTE(review): the 'public.subregions' entry is an identity mapping —
 * it has no effect as written; presumably kept for completeness. Verify
 * whether qualified subregions INSERTs are handled elsewhere.
 */
const TABLE_MAPPING = {
'public.regions': 'public.continents',
'regions': 'public.continents',
'public.subregions': 'public.subregions',
};
/**
 * Transform a subregions INSERT statement to match the migration schema.
 *
 * SQL dump column order:  (id, name, translations, region_id, created_at, updated_at, flag, wikiDataId)
 * Migration column order: (id, name, continent_id, translations, created_at, updated_at, flag, wiki_data_id)
 *
 * Fix: the original re-implemented the VALUES tokenizer inline (and with a
 * slightly different string-delimiter rule — it also treated `"` as a string
 * opener, unlike the shared helper). It now delegates to parseValues so all
 * transform functions tokenize SQL values identically.
 *
 * @param {string} stmt - A single-row `INSERT INTO subregions ... VALUES (...);` statement.
 * @returns {string} Rewritten INSERT with an explicit column list, or the
 *   original statement unchanged when the VALUES list cannot be parsed.
 */
function transformSubregionsInsert(stmt) {
  const valuesMatch = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!valuesMatch) return stmt;

  const values = parseValues(valuesMatch[1]);
  // Need at least the 8 dump columns; otherwise pass through untouched.
  if (values.length < 8) return stmt;

  const [id, name, translations, region_id, created, updated, flag, wikiDataId] = values;
  // region_id becomes continent_id and swaps position with translations.
  const reordered = [id, name, region_id, translations, created, updated, flag, wikiDataId];
  return `INSERT INTO subregions (id, name, continent_id, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${reordered.join(', ')});`;
}
/**
 * Transform a countries INSERT statement to match the migration schema.
 *
 * The dump carries 32 columns; the migration keeps 25 of them, renaming
 * region_id -> continent_id, emojiU -> emoji_u and wikiDataId -> wiki_data_id,
 * and dropping population, gdp, region, subregion, area_sq_km and the
 * postal-code columns.
 *
 * @param {string} stmt - Single-row INSERT statement from the dump.
 * @returns {string} Rewritten INSERT, or the original statement when it does
 *   not carry at least the expected 32 values.
 */
function transformCountriesInsert(stmt) {
  const match = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!match) return stmt;

  const cols = parseValues(match[1]);
  if (cols.length < 32) return stmt;

  // Dump positions of the 25 kept columns, in migration order:
  // id, name, iso2, iso3, numeric_code, phonecode, capital, currency,
  // currency_name, currency_symbol, tld, native, continent_id(<-region_id),
  // subregion_id, nationality, latitude, longitude, emoji, emoji_u(<-emojiU),
  // timezones, translations, created_at, updated_at, flag, wiki_data_id(<-wikiDataId)
  const KEEP = [0, 1, 4, 2, 3, 5, 6, 7, 8, 9, 10, 11, 15, 17, 18, 24, 25, 26, 27, 22, 23, 28, 29, 30, 31];
  const picked = KEEP.map((i) => cols[i]);
  return `INSERT INTO countries (id, name, iso2, iso3, numeric_code, phonecode, capital, currency, currency_name, currency_symbol, tld, native, continent_id, subregion_id, nationality, latitude, longitude, emoji, emoji_u, timezones, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${picked.join(', ')});`;
}
/**
 * Transform a states INSERT statement to match the migration schema.
 *
 * The dump carries 20 columns; the migration keeps 15, swapping the order of
 * iso2/fips_code and renaming wikiDataId -> wiki_data_id. Rows that do not
 * parse into exactly 20 values are passed through unchanged (with a warning).
 *
 * @param {string} stmt - Single-row INSERT statement from the dump.
 * @returns {string} Rewritten INSERT, or the original statement on a parse mismatch.
 */
function transformStatesInsert(stmt) {
  const match = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!match) return stmt;

  const cols = parseValues(match[1]);
  if (cols.length !== 20) {
    console.log(` ⚠️ Expected 20 columns for states, got ${cols.length}`);
    return stmt;
  }

  // Dump positions of the kept columns, in migration order:
  // id, name, country_id, country_code, iso2, fips_code, type, latitude,
  // longitude, timezone, translations, created_at, updated_at, flag, wiki_data_id
  const KEEP = [0, 1, 2, 3, 5, 4, 7, 11, 12, 13, 14, 15, 16, 17, 18];
  const picked = KEEP.map((i) => cols[i]);
  return `INSERT INTO states (id, name, country_id, country_code, iso2, fips_code, type, latitude, longitude, timezone, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${picked.join(', ')});`;
}
/**
 * Transform a cities INSERT statement to match the migration schema.
 *
 * The dump carries 19 columns; the migration keeps 15, dropping type, level,
 * parent_id and native, and renaming wikiDataId -> wiki_data_id. Rows that do
 * not parse into exactly 19 values are passed through unchanged.
 *
 * @param {string} stmt - Single-row INSERT statement from the dump.
 * @returns {string} Rewritten INSERT, or the original statement on a parse mismatch.
 */
function transformCitiesInsert(stmt) {
  const match = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!match) return stmt;

  const cols = parseValues(match[1]);
  if (cols.length !== 19) {
    // Column count doesn't match the expected dump layout — leave untouched.
    return stmt;
  }

  // Dump positions of the kept columns, in migration order:
  // id, name, state_id, state_code, country_id, country_code, latitude,
  // longitude, population, timezone, translations, created_at, updated_at,
  // flag, wiki_data_id
  const KEEP = [0, 1, 2, 3, 4, 5, 9, 10, 12, 13, 14, 15, 16, 17, 18];
  const picked = KEEP.map((i) => cols[i]);
  return `INSERT INTO cities (id, name, state_id, state_code, country_id, country_code, latitude, longitude, population, timezone, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${picked.join(', ')});`;
}
/**
 * Split the interior of a SQL VALUES(...) list into individual value tokens.
 *
 * Understands single-quoted string literals (with SQL doubled-'' escapes and
 * a backslash-escape guard) and ignores commas nested inside (), {} or [] so
 * JSON payloads such as '{"br": "a, b"}' survive as a single token. Quotes
 * are kept in the output so tokens can be re-emitted into SQL verbatim.
 *
 * @param {string} valuesStr - Raw text between the outer VALUES parentheses.
 * @returns {string[]} Trimmed value tokens in source order.
 */
function parseValues(valuesStr) {
  const tokens = [];
  let buf = '';
  let nesting = 0;
  let quoted = false;
  let quoteChar = '';

  let i = 0;
  while (i < valuesStr.length) {
    const ch = valuesStr[i];
    const before = i > 0 ? valuesStr[i - 1] : '';

    if (!quoted && ch === "'") {
      // Entering a string literal.
      quoted = true;
      quoteChar = ch;
    } else if (quoted && ch === quoteChar && before !== '\\') {
      if (valuesStr[i + 1] === quoteChar) {
        // SQL doubled quote ('') — keep the pair and stay inside the string.
        buf += ch;
        i++;
      } else {
        quoted = false;
      }
    }

    if (!quoted) {
      if (ch === '(' || ch === '{' || ch === '[') nesting++;
      if (ch === ')' || ch === '}' || ch === ']') nesting--;
      if (ch === ',' && nesting === 0) {
        // Top-level comma ends the current token.
        tokens.push(buf.trim());
        buf = '';
        i++;
        continue;
      }
    }

    buf += ch;
    i++;
  }

  if (buf.trim()) tokens.push(buf.trim());
  return tokens;
}
/**
 * Execute the INSERT statements from a plain-text SQL dump file.
 *
 * Reads the file from SQL_DIR, strips psql/pg_dump control statements,
 * remaps dump table names via TABLE_MAPPING, applies per-table column
 * transforms, and runs each INSERT with ON CONFLICT DO NOTHING so
 * re-seeding is idempotent.
 *
 * Fixes vs. previous revision:
 *  - log messages printed the literal text "$(unknown)" instead of the file
 *    name (broken template placeholders);
 *  - the subregions transform was skipped for schema-qualified
 *    `INSERT INTO public.subregions` statements (countries/states already
 *    checked both forms);
 *  - TABLE_MAPPING names are now regex-escaped (the dot in "public.regions"
 *    previously matched any character).
 *
 * @param {string} filename - Dump file name inside SQL_DIR.
 * @param {string} tableName - Target table name, used for logging only.
 * @returns {Promise<number>} Number of INSERT statements executed.
 * @throws Propagates file-system and database errors after logging them.
 */
async function executeSqlFile(filename, tableName) {
  const filePath = join(SQL_DIR, filename);
  console.log(` 📄 Loading ${filename}...`);
  try {
    let sql = readFileSync(filePath, 'utf8');
    // Clean up postgres-specific commands that might cause issues.
    // These need to match ONLY standalone commands, not content inside VALUES.
    sql = sql
      .replace(/\\restrict[^\n]*/g, '')
      .replace(/\\unrestrict[^\n]*/g, '')
      .replace(/^SELECT pg_catalog\.setval[^;]*;/gm, '')
      .replace(/^ALTER TABLE[^;]*OWNER TO[^;]*;/gm, '')
      .replace(/^COMMENT ON[^;]*;/gm, '')
      .replace(/^SET [a-z_]+\s*=/gmi, (match) => '-- ' + match) // Comment out SET statements
      .replace(/^SET [a-z_]+;$/gmi, (match) => '-- ' + match)   // Comment out simple SET statements
      .replace(/^SELECT[^;]*set_config[^;]*;/gm, '');

    // Extract only INSERT statements.
    const insertStatements = sql.match(/INSERT INTO[^;]+;/g) || [];
    if (insertStatements.length === 0) {
      console.log(` ⚠️ No INSERT statements found in ${filename}`);
      return 0;
    }

    for (const stmt of insertStatements) {
      // Convert MySQL-style identifiers and camelCase column names.
      let pgStmt = stmt
        .replace(/`/g, '"')
        .replace(/"emojiU"/g, 'emoji_u')
        .replace(/"wikiDataId"/g, 'wiki_data_id');

      // Apply table name mapping (escape the names — they contain dots).
      for (const [oldName, newName] of Object.entries(TABLE_MAPPING)) {
        const escaped = oldName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        pgStmt = pgStmt.replace(new RegExp(`INSERT INTO ${escaped}`, 'gi'), `INSERT INTO ${newName}`);
      }

      // Per-table column remapping; dumps may use schema-qualified names.
      if (pgStmt.includes('INSERT INTO subregions') || pgStmt.includes('INSERT INTO public.subregions')) {
        pgStmt = transformSubregionsInsert(pgStmt);
      }
      if (pgStmt.includes('INSERT INTO countries') || pgStmt.includes('INSERT INTO public.countries')) {
        pgStmt = transformCountriesInsert(pgStmt);
      }
      if (pgStmt.includes('INSERT INTO states') || pgStmt.includes('INSERT INTO public.states')) {
        pgStmt = transformStatesInsert(pgStmt);
      }

      // Prevent duplicate key errors on re-runs.
      if (pgStmt.trim().endsWith(';')) {
        pgStmt = pgStmt.trim().slice(0, -1) + ' ON CONFLICT DO NOTHING;';
      } else {
        pgStmt += ' ON CONFLICT DO NOTHING;';
      }
      await pool.query(pgStmt);
    }
    console.log(`${insertStatements.length} records inserted into ${tableName}`);
    return insertStatements.length;
  } catch (error) {
    console.error(` ❌ Error loading ${filename}:`, error.message);
    throw error;
  }
}
/**
 * Execute the INSERT statements from a gzipped SQL dump using bulk inserts.
 *
 * Decompresses the file in memory, extracts all single-row INSERTs, remaps
 * the columns for the `cities` table, then groups rows into multi-row
 * `INSERT ... VALUES (...), (...)` batches (BATCH_SIZE rows each) with
 * ON CONFLICT DO NOTHING for idempotent re-seeding.
 *
 * Fixes vs. previous revision:
 *  - log messages printed the literal "$(unknown)" instead of the file name;
 *  - a cities row that failed column transformation used to be pushed with
 *    its raw 19-column value list, aborting its entire 2000-row batch; such
 *    rows are now skipped and counted instead.
 *
 * NOTE(review): the whole decompressed dump is buffered in one string —
 * acceptable for one-off seeding, but this is not a streaming parser.
 *
 * @param {string} filename - Gzipped dump file name inside SQL_DIR.
 * @param {string} tableName - Target table; only 'cities' gets column remapping.
 * @returns {Promise<number>} Number of rows submitted for insertion.
 * @throws Propagates file-system and database errors after logging them.
 */
async function executeGzippedSqlFile(filename, tableName) {
  const filePath = join(SQL_DIR, filename);
  console.log(` 📄 Loading ${filename} (gzipped)...`);
  try {
    // Read and decompress into memory.
    let sql = '';
    await pipeline(
      createReadStream(filePath),
      createGunzip(),
      new Writable({
        write(chunk, encoding, callback) {
          sql += chunk.toString();
          callback();
        }
      })
    );

    // Clean up postgres-specific commands (same rules as executeSqlFile).
    sql = sql
      .replace(/\\restrict[^\n]*/g, '')
      .replace(/\\unrestrict[^\n]*/g, '')
      .replace(/^SELECT pg_catalog\.setval[^;]*;/gm, '')
      .replace(/^ALTER TABLE[^;]*OWNER TO[^;]*;/gm, '')
      .replace(/^COMMENT ON[^;]*;/gm, '')
      .replace(/^SET [a-z_]+\s*=/gmi, (match) => '-- ' + match)
      .replace(/^SET [a-z_]+;$/gmi, (match) => '-- ' + match)
      .replace(/^SELECT[^;]*set_config[^;]*;/gm, '');

    // Extract only INSERT statements.
    const insertStatements = sql.match(/INSERT INTO[^;]+;/g) || [];
    if (insertStatements.length === 0) {
      console.log(` ⚠️ No INSERT statements found in ${filename}`);
      return 0;
    }
    console.log(` 📊 Found ${insertStatements.length} records to process...`);
    console.log(` 🚀 Optimizing: Grouping into bulk INSERTs...`);

    // Pull just the "(...)" value list out of a single-row INSERT.
    const extractValues = (stmt) => {
      const match = stmt.match(/VALUES\s*\((.+)\);?$/is);
      return match ? match[1] : null;
    };

    const BATCH_SIZE = 2000; // rows per bulk INSERT

    // Column list and per-row transform depend on the target table.
    let columns = '';
    let transformFunc;
    if (tableName === 'cities') {
      columns = '(id, name, state_id, state_code, country_id, country_code, latitude, longitude, population, timezone, translations, created_at, updated_at, flag, wiki_data_id)';
      transformFunc = (stmt) => {
        const fullStmt = transformCitiesInsert(stmt);
        // transformCitiesInsert returns its input unchanged when the row
        // doesn't parse into 19 columns — skip it rather than poison a batch.
        if (fullStmt === stmt) return null;
        return extractValues(fullStmt);
      };
    } else {
      // Fallback for other tables: pass values through untransformed.
      transformFunc = (stmt) => extractValues(stmt);
    }

    let processedCount = 0;
    let skippedCount = 0;
    const valueBatches = [];
    let currentBatch = [];
    for (const stmt of insertStatements) {
      const values = transformFunc(stmt);
      if (!values) {
        skippedCount++;
        continue;
      }
      currentBatch.push(`(${values})`);
      processedCount++;
      if (currentBatch.length >= BATCH_SIZE) {
        valueBatches.push(currentBatch);
        currentBatch = [];
      }
    }
    if (currentBatch.length > 0) valueBatches.push(currentBatch);
    if (skippedCount > 0) {
      console.log(` ⚠️ Skipped ${skippedCount} rows that could not be transformed`);
    }

    // Execute batches with periodic progress output.
    console.log(` ⚡ Executing ${valueBatches.length} bulk queries...`);
    for (let i = 0; i < valueBatches.length; i++) {
      const batch = valueBatches[i];
      const query = `INSERT INTO ${tableName} ${columns} VALUES ${batch.join(', ')} ON CONFLICT DO NOTHING`;
      await pool.query(query);
      if ((i + 1) % 10 === 0 || i === valueBatches.length - 1) {
        process.stdout.write(`\r ... ${Math.min((i + 1) * BATCH_SIZE, processedCount)} / ${processedCount} rows`);
      }
    }
    console.log('');
    console.log(`${processedCount} records inserted into ${tableName}`);
    return processedCount;
  } catch (error) {
    console.error(` ❌ Error loading ${filename}:`, error.message);
    throw error;
  }
}
/**
 * Seed base location data (continents, subregions, countries).
 * Fast, and required before Company seeding can run.
 * @returns {Promise<void>}
 * @throws Re-throws any seeding error after logging it.
 */
export async function seedBaseLocations() {
  console.log('🌍 Seeding base location data (Continents -> Countries)...');
  // [step label, timer label, dump file, target table]
  const steps = [
    ['1⃣ Seeding Continents...', ' ⏱️ Continents', 'regions.sql', 'continents'],
    ['2⃣ Seeding Subregions...', ' ⏱️ Subregions', 'subregions.sql', 'subregions'],
    ['3⃣ Seeding Countries...', ' ⏱️ Countries', 'countries.sql', 'countries'],
  ];
  try {
    for (const [label, timer, file, table] of steps) {
      console.log(label);
      console.time(timer);
      await executeSqlFile(file, table);
      console.timeEnd(timer);
    }
  } catch (error) {
    console.error('❌ Base location seeding failed:', error.message);
    throw error;
  }
}
/**
 * Seed detailed location data (states, then cities).
 * Heavy — run after users/companies are ready.
 * @returns {Promise<void>}
 * @throws Re-throws any seeding error after logging it.
 */
export async function seedDetailedLocations() {
  console.log('🌍 Seeding detailed location data (States -> Cities)...');
  // [step label, timer label, loader] — cities use the gzipped bulk loader.
  const steps = [
    ['4⃣ Seeding States...', ' ⏱️ States', () => executeSqlFile('states.sql', 'states')],
    ['5⃣ Seeding Cities (this may take a while)...', ' ⏱️ Cities (Bulk Insert)', () => executeGzippedSqlFile('cities.sql.gz', 'cities')],
  ];
  try {
    for (const [label, timer, run] of steps) {
      console.log(label);
      console.time(timer);
      await run();
      console.timeEnd(timer);
    }
  } catch (error) {
    console.error('❌ Detailed location seeding failed:', error.message);
    throw error;
  }
}
/**
 * Seed all location data from the SQL dumps (legacy wrapper around
 * seedBaseLocations + seedDetailedLocations), then report row counts.
 * @returns {Promise<void>}
 * @throws Re-throws any seeding error after logging it.
 */
export async function seedLocationData() {
  console.log('🌍 Seeding comprehensive location data...');
  console.log(' Source: GeoDB Cities (https://github.com/dr5hn/countries-states-cities-database)\n');
  try {
    await seedBaseLocations();
    await seedDetailedLocations();
    console.log('\n ✅ Location data seeding complete!');

    // Summarize what actually landed in each table.
    const counts = await pool.query(`
      SELECT
        (SELECT COUNT(*) FROM continents) as continents,
        (SELECT COUNT(*) FROM subregions) as subregions,
        (SELECT COUNT(*) FROM countries) as countries,
        (SELECT COUNT(*) FROM states) as states,
        (SELECT COUNT(*) FROM cities) as cities
    `);
    const { continents, subregions, countries, states, cities } = counts.rows[0];
    console.log(` 📊 Totals: ${continents} continents, ${subregions} subregions, ${countries} countries, ${states} states, ${cities} cities`);
  } catch (error) {
    console.error('❌ Location seeding failed:', error.message);
    throw error;
  }
}
/**
 * Seed location data WITHOUT cities (fast mode for development).
 * Skips the ~153k-row cities import for a quicker database reset.
 * @returns {Promise<void>}
 * @throws Re-throws any seeding error after logging it.
 */
export async function seedLocationDataLite() {
  console.log('🌍 Seeding location data (LITE - no cities)...');
  console.log(' Source: GeoDB Cities (https://github.com/dr5hn/countries-states-cities-database)');
  console.log(' ⚡ Skipping cities for faster seeding\n');
  try {
    // [step label, dump file, target table]
    const steps = [
      ['1⃣ Seeding Continents...', 'regions.sql', 'continents'],
      ['2⃣ Seeding Subregions...', 'subregions.sql', 'subregions'],
      ['3⃣ Seeding Countries...', 'countries.sql', 'countries'],
      ['4⃣ Seeding States...', 'states.sql', 'states'],
    ];
    for (const [label, file, table] of steps) {
      console.log(label);
      await executeSqlFile(file, table);
    }
    console.log('5⃣ ⏭️ Skipping Cities (use full seed for cities)\n');
    console.log(' ✅ Location data LITE seeding complete!');

    // Report totals (cities will be 0 in lite mode).
    const counts = await pool.query(`
      SELECT
        (SELECT COUNT(*) FROM continents) as continents,
        (SELECT COUNT(*) FROM subregions) as subregions,
        (SELECT COUNT(*) FROM countries) as countries,
        (SELECT COUNT(*) FROM states) as states,
        (SELECT COUNT(*) FROM cities) as cities
    `);
    const { continents, subregions, countries, states, cities } = counts.rows[0];
    console.log(` 📊 Totals: ${continents} continents, ${subregions} subregions, ${countries} countries, ${states} states, ${cities} cities`);
  } catch (error) {
    console.error('❌ Location seeding failed:', error.message);
    throw error;
  }
}
// For direct execution: `node src/seeders/location-loader.js`
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  import('../db.js')
    .then(async ({ testConnection, closePool }) => {
      const connected = await testConnection();
      if (!connected) {
        console.error('Could not connect to database');
        process.exit(1);
      }
      await seedLocationData();
      await closePool();
    })
    .catch((error) => {
      // Fix: this promise chain previously had no rejection handler, so a
      // seeding failure surfaced only as an unhandled rejection (and the
      // process could exit with code 0 on older Node versions).
      console.error('Seeding failed:', error.message);
      process.exit(1);
    });
}