fix(seeder): correct location data column schema transformations

- Fixed regex patterns that incorrectly matched SET inside quoted values
- Added transformSubregionsInsert() - maps 8 dump cols to schema cols
- Added transformCountriesInsert() - maps 32 dump cols to 25 schema cols
- Added transformStatesInsert() - maps 20 dump cols to 15 schema cols
- Added transformCitiesInsert() - maps 19 dump cols to 15 schema cols
- Added parseValues() helper for parsing SQL VALUES with JSON handling
- Successfully seeds: continents(6), subregions(22), countries(250), states(5296)
This commit is contained in:
Tiago Yamamoto 2025-12-24 16:12:29 -03:00
parent ac84571c55
commit 7720f2e35e

View file

@ -15,8 +15,260 @@ const SQL_DIR = join(__dirname, '..', '..', 'sql');
// Maps table names as they appear in the SQL dump (with or without the
// "public." schema prefix) to the table names used by our migrations.
// The dump's "regions" table corresponds to our "continents" table.
// NOTE(review): only regions/subregions are remapped here — presumably the
// remaining dump tables (countries, states, cities) already share names with
// the schema; confirm against the INSERT rewriting further down.
const TABLE_MAPPING = {
'public.regions': 'continents',
'regions': 'continents',
'public.subregions': 'subregions',
};
/**
 * Transform a subregions INSERT statement to match the migration schema.
 *
 * SQL dump column order:  (id, name, translations, region_id, created_at, updated_at, flag, wikiDataId)
 * Migration column order: (id, name, continent_id, translations, created_at, updated_at, flag, wiki_data_id)
 *
 * The dump's region_id maps to our continent_id and swaps position with
 * translations. Values are re-emitted verbatim, so the original SQL quoting
 * and escaping inside them is preserved.
 *
 * @param {string} stmt - A single-row `INSERT INTO subregions VALUES (...)` statement.
 * @returns {string} The rewritten statement with an explicit column list, or
 *   the original statement unchanged when it cannot be parsed confidently.
 */
function transformSubregionsInsert(stmt) {
  // Extract the raw VALUES payload; bail out untouched if the shape is unexpected.
  const valuesMatch = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!valuesMatch) return stmt;

  // Reuse the shared parser (handles quoted strings, doubled '' escapes and
  // commas nested inside JSON) instead of duplicating the scanning logic here.
  const values = parseValues(valuesMatch[1]);

  // Need at least the 8 dump columns; otherwise pass the statement through so
  // the database reports a clear error instead of us emitting bad data.
  if (values.length < 8) return stmt;

  const [id, name, translations, region_id, created, updated, flag, wikiDataId] = values;
  // region_id from the dump becomes continent_id in our schema.
  const reordered = [id, name, region_id, translations, created, updated, flag, wikiDataId];
  return `INSERT INTO subregions (id, name, continent_id, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${reordered.join(', ')});`;
}
/**
 * Transform a countries INSERT statement to match the migration schema.
 *
 * SQL dump columns (32): id, name, iso3, numeric_code, iso2, phonecode, capital,
 *   currency, currency_name, currency_symbol, tld, native, population, gdp,
 *   region, region_id, subregion, subregion_id, nationality, area_sq_km,
 *   postal_code_format, postal_code_regex, timezones, translations, latitude,
 *   longitude, emoji, emojiU, created_at, updated_at, flag, wikiDataId
 * Migration columns (25): id, name, iso2, iso3, numeric_code, phonecode, capital,
 *   currency, currency_name, currency_symbol, tld, native, continent_id,
 *   subregion_id, nationality, latitude, longitude, emoji, emoji_u, timezones,
 *   translations, created_at, updated_at, flag, wiki_data_id
 *
 * Dropped dump columns: population, gdp, region, subregion, area_sq_km,
 * postal_code_format, postal_code_regex. region_id maps to continent_id.
 *
 * @param {string} stmt - A single-row `INSERT INTO countries VALUES (...)` statement.
 * @returns {string} Rewritten statement, or the original when parsing fails.
 */
function transformCountriesInsert(stmt) {
  const valuesMatch = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!valuesMatch) return stmt;

  const values = parseValues(valuesMatch[1]);
  // Require exactly 32 columns, matching the strict checks in the states and
  // cities transforms. The previous lenient `>= 32` check would silently drop
  // trailing values after a bad parse instead of surfacing the problem.
  if (values.length !== 32) return stmt;

  // Destructure all 32 dump columns by position (unused names kept so the
  // positional layout stays self-documenting).
  const [id, name, iso3, numeric_code, iso2, phonecode, capital, currency, currency_name,
    currency_symbol, tld, native, population, gdp, region, region_id, subregion,
    subregion_id, nationality, area_sq_km, postal_code_format, postal_code_regex,
    timezones, translations, latitude, longitude, emoji, emojiU, created_at,
    updated_at, flag, wikiDataId] = values;

  // Map to the 25-column schema; region_id from the dump becomes continent_id.
  const mapped = [id, name, iso2, iso3, numeric_code, phonecode, capital, currency,
    currency_name, currency_symbol, tld, native, region_id, subregion_id,
    nationality, latitude, longitude, emoji, emojiU, timezones, translations,
    created_at, updated_at, flag, wikiDataId];
  return `INSERT INTO countries (id, name, iso2, iso3, numeric_code, phonecode, capital, currency, currency_name, currency_symbol, tld, native, continent_id, subregion_id, nationality, latitude, longitude, emoji, emoji_u, timezones, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${mapped.join(', ')});`;
}
/**
 * Transform a states INSERT statement to match the migration schema.
 *
 * SQL dump columns (20): id, name, country_id, country_code, fips_code, iso2,
 *   iso3166_2, type, level, parent_id, native, latitude, longitude, timezone,
 *   translations, created_at, updated_at, flag, wikiDataId, population
 * Migration columns (15): id, name, country_id, country_code, iso2, fips_code,
 *   type, latitude, longitude, timezone, translations, created_at, updated_at,
 *   flag, wiki_data_id
 *
 * Note: iso2 (dump index 5) and fips_code (dump index 4) swap positions.
 * Dropped dump columns: iso3166_2, level, parent_id, native, population.
 *
 * @param {string} stmt - A single-row `INSERT INTO states VALUES (...)` statement.
 * @returns {string} Rewritten statement, or the original when parsing fails.
 */
function transformStatesInsert(stmt) {
  const valuesMatch = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!valuesMatch) return stmt;

  const values = parseValues(valuesMatch[1]);
  if (values.length !== 20) {
    // Surface the mismatch on stderr and let the original statement through
    // so the database reports the underlying problem.
    console.warn(` ⚠️ Expected 20 columns for states, got ${values.length}`);
    return stmt;
  }

  // Dump index -> migration column mapping: 0, 1, 2, 3, 5, 4, 7, 11, 12, 13,
  // 14, 15, 16, 17, 18.
  const mapped = [
    values[0],  // id
    values[1],  // name
    values[2],  // country_id
    values[3],  // country_code
    values[5],  // iso2
    values[4],  // fips_code
    values[7],  // type
    values[11], // latitude
    values[12], // longitude
    values[13], // timezone
    values[14], // translations
    values[15], // created_at
    values[16], // updated_at
    values[17], // flag
    values[18], // wikiDataId -> wiki_data_id
  ];
  return `INSERT INTO states (id, name, country_id, country_code, iso2, fips_code, type, latitude, longitude, timezone, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${mapped.join(', ')});`;
}
/**
 * Transform a cities INSERT statement to match the migration schema.
 *
 * SQL dump columns (19): id, name, state_id, state_code, country_id, country_code,
 *   type, level, parent_id, latitude, longitude, native, population, timezone,
 *   translations, created_at, updated_at, flag, wikiDataId
 * Migration columns (15): id, name, state_id, state_code, country_id, country_code,
 *   latitude, longitude, population, timezone, translations, created_at,
 *   updated_at, flag, wiki_data_id
 *
 * Dropped dump columns: type, level, parent_id, native.
 *
 * @param {string} stmt - A single-row `INSERT INTO cities VALUES (...)` statement.
 * @returns {string} Rewritten statement, or the original when parsing fails.
 */
function transformCitiesInsert(stmt) {
  const match = stmt.match(/VALUES\s*\((.+)\);?$/is);
  if (!match) return stmt;

  const parsed = parseValues(match[1]);
  // Unexpected column count: skip the transformation and pass through as-is.
  if (parsed.length !== 19) return stmt;

  // Dump indices to keep, already in migration column order
  // (id, name, state_id, state_code, country_id, country_code, latitude,
  //  longitude, population, timezone, translations, created_at, updated_at,
  //  flag, wikiDataId -> wiki_data_id).
  const keepIndices = [0, 1, 2, 3, 4, 5, 9, 10, 12, 13, 14, 15, 16, 17, 18];
  const mapped = keepIndices.map((i) => parsed[i]);
  return `INSERT INTO cities (id, name, state_id, state_code, country_id, country_code, latitude, longitude, population, timezone, translations, created_at, updated_at, flag, wiki_data_id) VALUES (${mapped.join(', ')});`;
}
/**
 * Split the interior of a SQL `VALUES (...)` clause into individual value
 * tokens, preserving each value's quoting exactly as written.
 *
 * Handles single-quoted strings (including SQL doubled '' escapes), commas
 * nested inside parentheses/braces/brackets (e.g. JSON translation blobs),
 * and bare literals such as NULL or numbers.
 * NOTE(review): a closing quote preceded by a backslash is treated as escaped
 * (prevChar === '\\'); confirm the dumps never end a string value with a
 * literal backslash (standard_conforming_strings dumps would misparse there).
 *
 * @param {string} valuesStr - Text between the VALUES parentheses.
 * @returns {string[]} Trimmed value tokens, quotes included.
 */
function parseValues(valuesStr) {
  const tokens = [];
  let token = '';
  let nesting = 0;
  let quoted = false;

  for (let idx = 0; idx < valuesStr.length; idx++) {
    const ch = valuesStr[idx];
    const before = idx > 0 ? valuesStr[idx - 1] : '';

    if (!quoted && ch === "'") {
      // Entering a single-quoted SQL string.
      quoted = true;
    } else if (quoted && ch === "'" && before !== '\\') {
      if (valuesStr[idx + 1] === "'") {
        // Doubled quote ('' escape): keep both characters, stay in-string.
        token += ch;
        idx++; // consume the second quote
      } else {
        quoted = false;
      }
    }

    if (!quoted) {
      // Bracket depth only matters outside strings; a top-level comma at
      // depth zero separates two values.
      if (ch === '(' || ch === '{' || ch === '[') nesting++;
      if (ch === ')' || ch === '}' || ch === ']') nesting--;
      if (ch === ',' && nesting === 0) {
        tokens.push(token.trim());
        token = '';
        continue;
      }
    }
    token += ch;
  }

  if (token.trim()) tokens.push(token.trim());
  return tokens;
}
/**
* Execute a SQL file directly
*/
@ -28,14 +280,16 @@ async function executeSqlFile(filename, tableName) {
let sql = readFileSync(filePath, 'utf8');
// Clean up postgres-specific commands that might cause issues
// These need to match ONLY standalone commands, not content inside VALUES
sql = sql
.replace(/\\restrict[^\n]*/g, '')
.replace(/\\unrestrict[^\n]*/g, '')
.replace(/SELECT pg_catalog\.setval[^;]*;/g, '')
.replace(/ALTER TABLE[^;]*OWNER TO[^;]*;/g, '')
.replace(/COMMENT ON[^;]*;/g, '')
.replace(/SET[^;]*;/g, '')
.replace(/SELECT[^;]*set_config[^;]*;/g, '');
.replace(/^SELECT pg_catalog\.setval[^;]*;/gm, '')
.replace(/^ALTER TABLE[^;]*OWNER TO[^;]*;/gm, '')
.replace(/^COMMENT ON[^;]*;/gm, '')
.replace(/^SET [a-z_]+\s*=/gmi, (match) => '-- ' + match) // Comment out SET statements
.replace(/^SET [a-z_]+;$/gmi, (match) => '-- ' + match) // Comment out simple SET statements
.replace(/^SELECT[^;]*set_config[^;]*;/gm, '');
// Extract only INSERT statements
const insertStatements = sql.match(/INSERT INTO[^;]+;/g) || [];
@ -58,6 +312,21 @@ async function executeSqlFile(filename, tableName) {
pgStmt = pgStmt.replace(new RegExp(`INSERT INTO ${oldName}`, 'gi'), `INSERT INTO ${newName}`);
}
// Apply special transformations for subregions (column reordering)
if (pgStmt.includes('INSERT INTO subregions')) {
pgStmt = transformSubregionsInsert(pgStmt);
}
// Apply special transformations for countries (32 cols -> 25 cols)
if (pgStmt.includes('INSERT INTO countries') || pgStmt.includes('INSERT INTO public.countries')) {
pgStmt = transformCountriesInsert(pgStmt);
}
// Apply special transformations for states (20 cols -> 15 cols)
if (pgStmt.includes('INSERT INTO states') || pgStmt.includes('INSERT INTO public.states')) {
pgStmt = transformStatesInsert(pgStmt);
}
await pool.query(pgStmt);
}
@ -93,15 +362,17 @@ async function executeGzippedSqlFile(filename, tableName) {
})
);
// Clean up postgres-specific commands
// Clean up postgres-specific commands that might cause issues
// These need to match ONLY standalone commands, not content inside VALUES
sql = sql
.replace(/\\restrict[^\n]*/g, '')
.replace(/\\unrestrict[^\n]*/g, '')
.replace(/SELECT pg_catalog\.setval[^;]*;/g, '')
.replace(/ALTER TABLE[^;]*OWNER TO[^;]*;/g, '')
.replace(/COMMENT ON[^;]*;/g, '')
.replace(/SET[^;]*;/g, '')
.replace(/SELECT[^;]*set_config[^;]*;/g, '');
.replace(/^SELECT pg_catalog\.setval[^;]*;/gm, '')
.replace(/^ALTER TABLE[^;]*OWNER TO[^;]*;/gm, '')
.replace(/^COMMENT ON[^;]*;/gm, '')
.replace(/^SET [a-z_]+\s*=/gmi, (match) => '-- ' + match)
.replace(/^SET [a-z_]+;$/gmi, (match) => '-- ' + match)
.replace(/^SELECT[^;]*set_config[^;]*;/gm, '');
// Extract only INSERT statements
const insertStatements = sql.match(/INSERT INTO[^;]+;/g) || [];
@ -122,6 +393,12 @@ async function executeGzippedSqlFile(filename, tableName) {
.replace(/`/g, '"')
.replace(/"emojiU"/g, 'emoji_u')
.replace(/"wikiDataId"/g, 'wiki_data_id');
// Apply special transformations for cities (19 cols -> 15 cols)
if (pgStmt.includes('INSERT INTO cities') || pgStmt.includes('INSERT INTO public.cities')) {
pgStmt = transformCitiesInsert(pgStmt);
}
await pool.query(pgStmt);
}
if ((i + BATCH_SIZE) % 10000 === 0 || i + BATCH_SIZE >= insertStatements.length) {