|
|
|
|
@@ -3,13 +3,26 @@
|
|
|
|
|
*
|
|
|
|
|
* Generates and caches embeddings for categories, themes, and colors.
|
|
|
|
|
* Excludes "Black Friday", "Gifts", "Deals" categories and their children.
|
|
|
|
|
*
|
|
|
|
|
* Disk cache: embeddings are saved to data/taxonomy-embeddings.json and reused
|
|
|
|
|
* across server restarts. Cache is invalidated by content hash — if the taxonomy
|
|
|
|
|
* rows in MySQL change, the next check will detect it and regenerate automatically.
|
|
|
|
|
*
|
|
|
|
|
* Background check: after initialization, call startBackgroundCheck(getConnectionFn)
|
|
|
|
|
* to poll for taxonomy changes on a configurable interval (default 1h).
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
const fs = require('fs');
|
|
|
|
|
const path = require('path');
|
|
|
|
|
const crypto = require('crypto');
|
|
|
|
|
const { findTopMatches } = require('./similarity');
|
|
|
|
|
|
|
|
|
|
// Categories to exclude (and all their children)
|
|
|
|
|
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];
|
|
|
|
|
|
|
|
|
|
// Disk cache config
|
|
|
|
|
const CACHE_PATH = path.join(__dirname, '..', '..', '..', '..', 'data', 'taxonomy-embeddings.json');
|
|
|
|
|
|
|
|
|
|
class TaxonomyEmbeddings {
|
|
|
|
|
constructor({ provider, logger }) {
|
|
|
|
|
this.provider = provider;
|
|
|
|
|
@@ -25,12 +38,18 @@ class TaxonomyEmbeddings {
|
|
|
|
|
this.themeMap = new Map();
|
|
|
|
|
this.colorMap = new Map();
|
|
|
|
|
|
|
|
|
|
// Content hash of the last successfully built taxonomy (from DB rows)
|
|
|
|
|
this.contentHash = null;
|
|
|
|
|
|
|
|
|
|
this.initialized = false;
|
|
|
|
|
this.initializing = false;
|
|
|
|
|
this._checkInterval = null;
|
|
|
|
|
this._regenerating = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Initialize embeddings - fetch taxonomy and generate embeddings
|
|
|
|
|
* Initialize embeddings — fetches raw taxonomy rows to compute a content hash,
|
|
|
|
|
* then either loads the matching disk cache or generates fresh embeddings.
|
|
|
|
|
*/
|
|
|
|
|
async initialize(connection) {
|
|
|
|
|
if (this.initialized) {
|
|
|
|
|
@@ -48,42 +67,36 @@ class TaxonomyEmbeddings {
|
|
|
|
|
this.initializing = true;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] Starting initialization...');
|
|
|
|
|
// Always fetch raw rows first — cheap (~10ms), no OpenAI calls.
|
|
|
|
|
// Used to compute a content hash for cache validation.
|
|
|
|
|
const rawRows = await this._fetchRawRows(connection);
|
|
|
|
|
const freshHash = this._computeContentHash(rawRows);
|
|
|
|
|
|
|
|
|
|
// Fetch raw taxonomy data
|
|
|
|
|
const [categories, themes, colors] = await Promise.all([
|
|
|
|
|
this._fetchCategories(connection),
|
|
|
|
|
this._fetchThemes(connection),
|
|
|
|
|
this._fetchColors(connection)
|
|
|
|
|
]);
|
|
|
|
|
const cached = this._loadCache();
|
|
|
|
|
if (cached && cached.contentHash === freshHash) {
|
|
|
|
|
this.categories = cached.categories;
|
|
|
|
|
this.themes = cached.themes;
|
|
|
|
|
this.colors = cached.colors;
|
|
|
|
|
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
|
|
|
|
|
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
|
|
|
|
|
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
|
|
|
|
|
this.contentHash = freshHash;
|
|
|
|
|
this.initialized = true;
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Loaded from cache: ${this.categories.length} categories, ${this.themes.length} themes, ${this.colors.length} colors`);
|
|
|
|
|
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Fetched ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
|
|
|
|
|
|
|
|
|
|
// Generate embeddings in parallel
|
|
|
|
|
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
|
|
|
|
|
this._generateEmbeddings(categories, 'categories'),
|
|
|
|
|
this._generateEmbeddings(themes, 'themes'),
|
|
|
|
|
this._generateEmbeddings(colors, 'colors')
|
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
// Store with embeddings
|
|
|
|
|
this.categories = catEmbeddings;
|
|
|
|
|
this.themes = themeEmbeddings;
|
|
|
|
|
this.colors = colorEmbeddings;
|
|
|
|
|
|
|
|
|
|
// Build lookup maps
|
|
|
|
|
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
|
|
|
|
|
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
|
|
|
|
|
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
|
|
|
|
|
if (cached) {
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] Taxonomy changed since cache was built, regenerating...');
|
|
|
|
|
} else {
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] No cache — fetching taxonomy and generating embeddings...');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
await this._buildAndEmbed(rawRows, freshHash);
|
|
|
|
|
this.initialized = true;
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] Initialization complete');
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
categories: this.categories.length,
|
|
|
|
|
themes: this.themes.length,
|
|
|
|
|
colors: this.colors.length
|
|
|
|
|
};
|
|
|
|
|
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
|
|
|
|
|
} catch (error) {
|
|
|
|
|
this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
|
|
|
|
|
throw error;
|
|
|
|
|
@@ -92,6 +105,47 @@ class TaxonomyEmbeddings {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Start a background interval that checks for taxonomy changes and regenerates
|
|
|
|
|
* embeddings automatically if the content hash differs.
|
|
|
|
|
*
|
|
|
|
|
* @param {Function} getConnectionFn - async function returning { connection }
|
|
|
|
|
* @param {number} intervalMs - check interval, default 1 hour
|
|
|
|
|
*/
|
|
|
|
|
startBackgroundCheck(getConnectionFn, intervalMs = 60 * 60 * 1000) {
|
|
|
|
|
if (this._checkInterval) return;
|
|
|
|
|
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Background taxonomy check started (every ${intervalMs / 60000} min)`);
|
|
|
|
|
|
|
|
|
|
this._checkInterval = setInterval(async () => {
|
|
|
|
|
if (this._regenerating) return;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
const { connection } = await getConnectionFn();
|
|
|
|
|
const rawRows = await this._fetchRawRows(connection);
|
|
|
|
|
const freshHash = this._computeContentHash(rawRows);
|
|
|
|
|
|
|
|
|
|
if (freshHash === this.contentHash) return;
|
|
|
|
|
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] Taxonomy changed, regenerating embeddings in background...');
|
|
|
|
|
this._regenerating = true;
|
|
|
|
|
await this._buildAndEmbed(rawRows, freshHash);
|
|
|
|
|
this.logger.info('[TaxonomyEmbeddings] Background regeneration complete');
|
|
|
|
|
} catch (err) {
|
|
|
|
|
this.logger.warn('[TaxonomyEmbeddings] Background taxonomy check failed:', err.message);
|
|
|
|
|
} finally {
|
|
|
|
|
this._regenerating = false;
|
|
|
|
|
}
|
|
|
|
|
}, intervalMs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
stopBackgroundCheck() {
|
|
|
|
|
if (this._checkInterval) {
|
|
|
|
|
clearInterval(this._checkInterval);
|
|
|
|
|
this._checkInterval = null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Find similar categories for a product embedding
|
|
|
|
|
*/
|
|
|
|
|
@@ -176,29 +230,74 @@ class TaxonomyEmbeddings {
|
|
|
|
|
// Private Methods
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
async _fetchCategories(connection) {
|
|
|
|
|
// Fetch hierarchical categories (types 10-13)
|
|
|
|
|
const [rows] = await connection.query(`
|
|
|
|
|
SELECT cat_id, name, master_cat_id, type
|
|
|
|
|
FROM product_categories
|
|
|
|
|
WHERE type IN (10, 11, 12, 13)
|
|
|
|
|
ORDER BY type, name
|
|
|
|
|
`);
|
|
|
|
|
/**
|
|
|
|
|
* Fetch minimal raw rows from MySQL — used for content hash computation.
|
|
|
|
|
* This is the cheap path: no path-building, no embeddings, just the raw data.
|
|
|
|
|
*/
|
|
|
|
|
async _fetchRawRows(connection) {
|
|
|
|
|
const [[catRows], [themeRows], [colorRows]] = await Promise.all([
|
|
|
|
|
connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (10, 11, 12, 13) ORDER BY cat_id'),
|
|
|
|
|
connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (20, 21) ORDER BY cat_id'),
|
|
|
|
|
connection.query('SELECT color, name, hex_color FROM product_color_list ORDER BY `order`')
|
|
|
|
|
]);
|
|
|
|
|
return { catRows, themeRows, colorRows };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Build lookup for hierarchy
|
|
|
|
|
/**
|
|
|
|
|
* Compute a stable SHA-256 hash of the taxonomy row content.
|
|
|
|
|
* Any change to IDs, names, or parent relationships will produce a different hash.
|
|
|
|
|
*/
|
|
|
|
|
_computeContentHash({ catRows, themeRows, colorRows }) {
|
|
|
|
|
const content = JSON.stringify({
|
|
|
|
|
cats: catRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
|
|
|
|
|
themes: themeRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
|
|
|
|
|
colors: colorRows.map(r => [r.color, r.name]).sort()
|
|
|
|
|
});
|
|
|
|
|
return crypto.createHash('sha256').update(content).digest('hex').slice(0, 16);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Build full taxonomy objects and generate embeddings, then atomically swap
|
|
|
|
|
* the in-memory state. Called on cache miss and on background change detection.
|
|
|
|
|
*/
|
|
|
|
|
async _buildAndEmbed(rawRows, contentHash) {
|
|
|
|
|
const { catRows, themeRows, colorRows } = rawRows;
|
|
|
|
|
|
|
|
|
|
const categories = this._buildCategories(catRows);
|
|
|
|
|
const themes = this._buildThemes(themeRows);
|
|
|
|
|
const colors = this._buildColors(colorRows);
|
|
|
|
|
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Generating embeddings for ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
|
|
|
|
|
|
|
|
|
|
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
|
|
|
|
|
this._generateEmbeddings(categories, 'categories'),
|
|
|
|
|
this._generateEmbeddings(themes, 'themes'),
|
|
|
|
|
this._generateEmbeddings(colors, 'colors')
|
|
|
|
|
]);
|
|
|
|
|
|
|
|
|
|
// Atomic in-memory swap (single-threaded JS — readers always see a consistent state)
|
|
|
|
|
this.categories = catEmbeddings;
|
|
|
|
|
this.themes = themeEmbeddings;
|
|
|
|
|
this.colors = colorEmbeddings;
|
|
|
|
|
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
|
|
|
|
|
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
|
|
|
|
|
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
|
|
|
|
|
this.contentHash = contentHash;
|
|
|
|
|
|
|
|
|
|
this._saveCache();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
_buildCategories(rows) {
|
|
|
|
|
const byId = new Map(rows.map(r => [r.cat_id, r]));
|
|
|
|
|
|
|
|
|
|
// Find IDs of excluded top-level categories and all their descendants
|
|
|
|
|
const excludedIds = new Set();
|
|
|
|
|
|
|
|
|
|
// First pass: find excluded top-level categories
|
|
|
|
|
for (const row of rows) {
|
|
|
|
|
if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(row.name.toLowerCase())) {
|
|
|
|
|
excludedIds.add(row.cat_id);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Multiple passes to find all descendants
|
|
|
|
|
// Multiple passes to find all descendants of excluded categories
|
|
|
|
|
let foundNew = true;
|
|
|
|
|
while (foundNew) {
|
|
|
|
|
foundNew = false;
|
|
|
|
|
@@ -212,20 +311,14 @@ class TaxonomyEmbeddings {
|
|
|
|
|
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);
|
|
|
|
|
|
|
|
|
|
// Build category objects with full paths, excluding filtered ones
|
|
|
|
|
const categories = [];
|
|
|
|
|
|
|
|
|
|
for (const row of rows) {
|
|
|
|
|
if (excludedIds.has(row.cat_id)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (excludedIds.has(row.cat_id)) continue;
|
|
|
|
|
|
|
|
|
|
const path = [];
|
|
|
|
|
const pathParts = [];
|
|
|
|
|
let current = row;
|
|
|
|
|
|
|
|
|
|
// Walk up the tree to build full path
|
|
|
|
|
while (current) {
|
|
|
|
|
path.unshift(current.name);
|
|
|
|
|
pathParts.unshift(current.name);
|
|
|
|
|
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -234,55 +327,37 @@ class TaxonomyEmbeddings {
|
|
|
|
|
name: row.name,
|
|
|
|
|
parentId: row.master_cat_id,
|
|
|
|
|
type: row.type,
|
|
|
|
|
fullPath: path.join(' > '),
|
|
|
|
|
embeddingText: path.join(' ')
|
|
|
|
|
fullPath: pathParts.join(' > '),
|
|
|
|
|
embeddingText: pathParts.join(' ')
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return categories;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async _fetchThemes(connection) {
|
|
|
|
|
// Fetch themes (types 20-21)
|
|
|
|
|
const [rows] = await connection.query(`
|
|
|
|
|
SELECT cat_id, name, master_cat_id, type
|
|
|
|
|
FROM product_categories
|
|
|
|
|
WHERE type IN (20, 21)
|
|
|
|
|
ORDER BY type, name
|
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
_buildThemes(rows) {
|
|
|
|
|
const byId = new Map(rows.map(r => [r.cat_id, r]));
|
|
|
|
|
const themes = [];
|
|
|
|
|
|
|
|
|
|
for (const row of rows) {
|
|
|
|
|
const path = [];
|
|
|
|
|
return rows.map(row => {
|
|
|
|
|
const pathParts = [];
|
|
|
|
|
let current = row;
|
|
|
|
|
|
|
|
|
|
while (current) {
|
|
|
|
|
path.unshift(current.name);
|
|
|
|
|
pathParts.unshift(current.name);
|
|
|
|
|
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
themes.push({
|
|
|
|
|
return {
|
|
|
|
|
id: row.cat_id,
|
|
|
|
|
name: row.name,
|
|
|
|
|
parentId: row.master_cat_id,
|
|
|
|
|
type: row.type,
|
|
|
|
|
fullPath: path.join(' > '),
|
|
|
|
|
embeddingText: path.join(' ')
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return themes;
|
|
|
|
|
fullPath: pathParts.join(' > '),
|
|
|
|
|
embeddingText: pathParts.join(' ')
|
|
|
|
|
};
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async _fetchColors(connection) {
|
|
|
|
|
const [rows] = await connection.query(`
|
|
|
|
|
SELECT color, name, hex_color
|
|
|
|
|
FROM product_color_list
|
|
|
|
|
ORDER BY \`order\`
|
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
_buildColors(rows) {
|
|
|
|
|
return rows.map(row => ({
|
|
|
|
|
id: row.color,
|
|
|
|
|
name: row.name,
|
|
|
|
|
@@ -301,9 +376,7 @@ class TaxonomyEmbeddings {
|
|
|
|
|
const results = [...items];
|
|
|
|
|
|
|
|
|
|
// Process in batches
|
|
|
|
|
let batchNum = 0;
|
|
|
|
|
for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
|
|
|
|
|
batchNum++;
|
|
|
|
|
for (let i = 0; i < chunk.embeddings.length; i++) {
|
|
|
|
|
const globalIndex = chunk.startIndex + i;
|
|
|
|
|
results[globalIndex] = {
|
|
|
|
|
@@ -318,6 +391,43 @@ class TaxonomyEmbeddings {
|
|
|
|
|
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Disk Cache Methods
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
_loadCache() {
|
|
|
|
|
try {
|
|
|
|
|
if (!fs.existsSync(CACHE_PATH)) return null;
|
|
|
|
|
|
|
|
|
|
const data = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
|
|
|
|
|
if (!data.contentHash || !data.categories?.length || !data.themes?.length || !data.colors?.length) {
|
|
|
|
|
this.logger.warn('[TaxonomyEmbeddings] Disk cache malformed or missing content hash, will regenerate');
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return data;
|
|
|
|
|
} catch (err) {
|
|
|
|
|
this.logger.warn('[TaxonomyEmbeddings] Failed to load disk cache:', err.message);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
_saveCache() {
|
|
|
|
|
try {
|
|
|
|
|
fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
|
|
|
|
|
fs.writeFileSync(CACHE_PATH, JSON.stringify({
|
|
|
|
|
generatedAt: new Date().toISOString(),
|
|
|
|
|
contentHash: this.contentHash,
|
|
|
|
|
categories: this.categories,
|
|
|
|
|
themes: this.themes,
|
|
|
|
|
colors: this.colors,
|
|
|
|
|
}));
|
|
|
|
|
this.logger.info(`[TaxonomyEmbeddings] Disk cache saved to ${CACHE_PATH}`);
|
|
|
|
|
} catch (err) {
|
|
|
|
|
this.logger.warn('[TaxonomyEmbeddings] Failed to save disk cache:', err.message);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
module.exports = { TaxonomyEmbeddings };
|
|
|
|
|
|