/**
 * Taxonomy Embedding Service
 *
 * Generates and caches embeddings for categories, themes, and colors.
 * Excludes "Black Friday", "Gifts", "Deals" categories and their children.
 *
 * Disk cache: embeddings are saved to data/taxonomy-embeddings.json and reused
 * across server restarts. The cache is validated by content hash — if the
 * hashed taxonomy fields in MySQL change, the next check detects it and
 * regenerates automatically.
 *
 * Background check: after initialization, call startBackgroundCheck(getConnectionFn)
 * to poll for taxonomy changes on a configurable interval (default 1h).
 */

const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { findTopMatches } = require('./similarity');

// Top-level (type 10) category names to exclude, together with all their
// descendants. Compared against the lower-cased category name.
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];

// Disk cache location
const CACHE_PATH = path.join(__dirname, '..', '..', '..', '..', 'data', 'taxonomy-embeddings.json');

class TaxonomyEmbeddings {
  /**
   * @param {Object} options
   * @param {Object} options.provider - embedding provider; must expose
   *   `embedBatchChunked(texts, { batchSize })` as an async iterable of
   *   `{ startIndex, embeddings }` chunks
   * @param {Object} [options.logger] - logger with info/warn/error; defaults to console
   */
  constructor({ provider, logger }) {
    this.provider = provider;
    this.logger = logger || console;

    // Taxonomy items with their embeddings attached
    this.categories = [];
    this.themes = [];
    this.colors = [];

    // id -> item lookups for the arrays above
    this.categoryMap = new Map();
    this.themeMap = new Map();
    this.colorMap = new Map();

    // Content hash of the last successfully built taxonomy (from DB rows)
    this.contentHash = null;

    this.initialized = false;
    this.initializing = false;
    // In-flight initialization shared by concurrent initialize() callers
    this._initPromise = null;

    this._checkInterval = null;
    // True while a background check (and any regeneration it triggers) is running
    this._regenerating = false;
  }

  /**
   * Initialize embeddings — fetches raw taxonomy rows to compute a content hash,
   * then either loads the matching disk cache or generates fresh embeddings.
   *
   * Concurrent calls share a single in-flight initialization, so every caller
   * observes the same outcome: the counts on success, or the same rejection on
   * failure. After a failure the method can be called again to retry.
   *
   * @param {Object} connection - DB connection exposing `query(sql)`
   * @returns {Promise<{categories: number, themes: number, colors: number}>}
   * @throws rethrows whatever fetching, embedding, or state building threw
   */
  async initialize(connection) {
    if (this.initialized) {
      return this._counts();
    }

    if (!this._initPromise) {
      this.initializing = true;
      this._initPromise = this._runInitialization(connection).finally(() => {
        this.initializing = false;
        this._initPromise = null;
      });
    }

    return this._initPromise;
  }

  /**
   * Start a background interval that checks for taxonomy changes and regenerates
   * embeddings automatically if the content hash differs. Calling it again while
   * a check interval is already active is a no-op.
   *
   * @param {Function} getConnectionFn - async function returning { connection }
   * @param {number} intervalMs - check interval, default 1 hour
   */
  startBackgroundCheck(getConnectionFn, intervalMs = 60 * 60 * 1000) {
    if (this._checkInterval) return;

    this.logger.info(`[TaxonomyEmbeddings] Background taxonomy check started (every ${intervalMs / 60000} min)`);

    this._checkInterval = setInterval(async () => {
      // Claim the flag before the first await so ticks can never overlap.
      // Claiming it only after the hash comparison would let a second tick run
      // concurrently and clear the flag in its own `finally` while the first
      // tick is still regenerating.
      if (this._regenerating) return;
      this._regenerating = true;

      try {
        const { connection } = await getConnectionFn();
        const rawRows = await this._fetchRawRows(connection);
        const freshHash = this._computeContentHash(rawRows);

        if (freshHash === this.contentHash) return;

        this.logger.info('[TaxonomyEmbeddings] Taxonomy changed, regenerating embeddings in background...');
        await this._buildAndEmbed(rawRows, freshHash);
        this.logger.info('[TaxonomyEmbeddings] Background regeneration complete');
      } catch (err) {
        // Best effort: keep serving the previous embeddings and try again next tick.
        this.logger.warn('[TaxonomyEmbeddings] Background taxonomy check failed:', err?.message ?? err);
      } finally {
        this._regenerating = false;
      }
    }, intervalMs);
  }

  /**
   * Stop the background check interval, if one is running.
   */
  stopBackgroundCheck() {
    if (this._checkInterval) {
      clearInterval(this._checkInterval);
      this._checkInterval = null;
    }
  }

  /**
   * Find similar categories for a product embedding.
   *
   * @param {*} productEmbedding - embedding passed through to findTopMatches
   * @param {number} topK - passed through to findTopMatches
   * @returns {Array<{id: *, name: string, fullPath: string, similarity: *}>}
   *   empty when the service is not initialized or no embedding is given
   */
  findSimilarCategories(productEmbedding, topK = 10) {
    if (!this.initialized || !productEmbedding) {
      return [];
    }

    const matches = findTopMatches(productEmbedding, this.categories, topK);

    return matches.map(match => {
      const cat = this.categoryMap.get(match.id);
      return {
        id: match.id,
        name: cat?.name || '',
        fullPath: cat?.fullPath || '',
        similarity: match.similarity
      };
    });
  }

  /**
   * Find similar themes for a product embedding.
   *
   * @param {*} productEmbedding - embedding passed through to findTopMatches
   * @param {number} topK - passed through to findTopMatches
   * @returns {Array<{id: *, name: string, fullPath: string, similarity: *}>}
   *   empty when the service is not initialized or no embedding is given
   */
  findSimilarThemes(productEmbedding, topK = 5) {
    if (!this.initialized || !productEmbedding) {
      return [];
    }

    const matches = findTopMatches(productEmbedding, this.themes, topK);

    return matches.map(match => {
      const theme = this.themeMap.get(match.id);
      return {
        id: match.id,
        name: theme?.name || '',
        fullPath: theme?.fullPath || '',
        similarity: match.similarity
      };
    });
  }

  /**
   * Find similar colors for a product embedding.
   *
   * @param {*} productEmbedding - embedding passed through to findTopMatches
   * @param {number} topK - passed through to findTopMatches
   * @returns {Array<{id: *, name: string, similarity: *}>}
   *   empty when the service is not initialized or no embedding is given
   */
  findSimilarColors(productEmbedding, topK = 5) {
    if (!this.initialized || !productEmbedding) {
      return [];
    }

    const matches = findTopMatches(productEmbedding, this.colors, topK);

    return matches.map(match => {
      const color = this.colorMap.get(match.id);
      return {
        id: match.id,
        name: color?.name || '',
        similarity: match.similarity
      };
    });
  }

  /**
   * Get all taxonomy data (without embeddings) for frontend.
   *
   * @returns {{categories: Array, themes: Array, colors: Array}}
   */
  getTaxonomyData() {
    return {
      categories: this.categories.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
      themes: this.themes.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
      colors: this.colors.map(({ id, name }) => ({ id, name }))
    };
  }

  /**
   * Check if service is ready.
   *
   * @returns {boolean} true once initialize() has completed successfully
   */
  isReady() {
    return this.initialized;
  }

  // ============================================================================
  // Private Methods
  // ============================================================================

  /**
   * Current item counts, in the shape initialize() resolves with.
   */
  _counts() {
    return {
      categories: this.categories.length,
      themes: this.themes.length,
      colors: this.colors.length
    };
  }

  /**
   * Perform one initialization attempt: hash the raw rows, then load the disk
   * cache if its hash matches or build fresh embeddings otherwise.
   * Logs and rethrows any failure.
   */
  async _runInitialization(connection) {
    try {
      // Always fetch raw rows first — no embedding calls are involved.
      // Used to compute a content hash for cache validation.
      const rawRows = await this._fetchRawRows(connection);
      const freshHash = this._computeContentHash(rawRows);

      const cached = this._loadCache();
      if (cached && cached.contentHash === freshHash) {
        this._applyTaxonomy(cached.categories, cached.themes, cached.colors, freshHash);
        this.initialized = true;
        this.logger.info(`[TaxonomyEmbeddings] Loaded from cache: ${this.categories.length} categories, ${this.themes.length} themes, ${this.colors.length} colors`);
        return this._counts();
      }

      if (cached) {
        this.logger.info('[TaxonomyEmbeddings] Taxonomy changed since cache was built, regenerating...');
      } else {
        this.logger.info('[TaxonomyEmbeddings] No cache — fetching taxonomy and generating embeddings...');
      }

      await this._buildAndEmbed(rawRows, freshHash);
      this.initialized = true;
      this.logger.info('[TaxonomyEmbeddings] Initialization complete');
      return this._counts();
    } catch (error) {
      this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
      throw error;
    }
  }

  /**
   * Replace the in-memory taxonomy, its lookup maps, and the content hash.
   * Runs synchronously, so no other code observes a half-updated state.
   */
  _applyTaxonomy(categories, themes, colors, contentHash) {
    this.categories = categories;
    this.themes = themes;
    this.colors = colors;
    this.categoryMap = new Map(categories.map(c => [c.id, c]));
    this.themeMap = new Map(themes.map(t => [t.id, t]));
    this.colorMap = new Map(colors.map(c => [c.id, c]));
    this.contentHash = contentHash;
  }

  /**
   * Fetch minimal raw rows from MySQL — used for content hash computation.
   * This is the cheap path: no path-building, no embeddings, just the raw data.
   *
   * @param {Object} connection - must expose `query(sql)` resolving to an array
   *   whose first element is the row list
   * @returns {Promise<{catRows: Array, themeRows: Array, colorRows: Array}>}
   */
  async _fetchRawRows(connection) {
    const [[catRows], [themeRows], [colorRows]] = await Promise.all([
      connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (10, 11, 12, 13) ORDER BY cat_id'),
      connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (20, 21) ORDER BY cat_id'),
      connection.query('SELECT color, name, hex_color FROM product_color_list ORDER BY `order`')
    ]);
    return { catRows, themeRows, colorRows };
  }

  /**
   * Compute a stable hash of the taxonomy row content: the first 16 hex
   * characters of a SHA-256 digest. Any change to IDs, names, or parent
   * relationships will produce a different hash.
   *
   * Only those fields are hashed: `type`, `hex_color`, and the color ordering
   * do not affect the result. The serialization is kept exactly as is so that
   * hashes stored in existing disk caches remain comparable.
   */
  _computeContentHash({ catRows, themeRows, colorRows }) {
    const content = JSON.stringify({
      cats: catRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
      themes: themeRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
      // Default sort compares the stringified [color, name] pairs; deterministic.
      colors: colorRows.map(r => [r.color, r.name]).sort()
    });
    return crypto.createHash('sha256').update(content).digest('hex').slice(0, 16);
  }

  /**
   * Build full taxonomy objects and generate embeddings, then swap the
   * in-memory state in one synchronous step and persist it to the disk cache.
   * Called on cache miss and on background change detection.
   * If embedding generation throws, the previous in-memory state is untouched.
   */
  async _buildAndEmbed(rawRows, contentHash) {
    const { catRows, themeRows, colorRows } = rawRows;

    const categories = this._buildCategories(catRows);
    const themes = this._buildThemes(themeRows);
    const colors = this._buildColors(colorRows);

    this.logger.info(`[TaxonomyEmbeddings] Generating embeddings for ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);

    const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
      this._generateEmbeddings(categories, 'categories'),
      this._generateEmbeddings(themes, 'themes'),
      this._generateEmbeddings(colors, 'colors')
    ]);

    // No await between these assignments, so readers never see a mixed state.
    this._applyTaxonomy(catEmbeddings, themeEmbeddings, colorEmbeddings, contentHash);

    this._saveCache();
  }

  /**
   * Collect the names from the root ancestor down to `row` by following
   * master_cat_id links through `byId`.
   *
   * The walk stops at a missing parent, at a falsy master_cat_id, or when an
   * id repeats — the latter guards against cyclic parent links in the data,
   * which would otherwise loop forever.
   */
  _buildPathParts(row, byId) {
    const parts = [];
    const visited = new Set();
    let current = row;

    while (current && !visited.has(current.cat_id)) {
      visited.add(current.cat_id);
      parts.unshift(current.name);
      current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
    }

    return parts;
  }

  /**
   * Build category objects from raw rows, dropping the excluded type-10
   * categories and every descendant of them.
   */
  _buildCategories(rows) {
    const byId = new Map(rows.map(r => [r.cat_id, r]));

    const excludedIds = new Set();
    for (const row of rows) {
      // String(... ?? '') keeps a NULL name from throwing on toLowerCase().
      const lowerName = String(row.name ?? '').toLowerCase();
      if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(lowerName)) {
        excludedIds.add(row.cat_id);
      }
    }

    // Repeat until a full pass adds nothing, so descendants at any depth are
    // found regardless of row order.
    let foundNew = true;
    while (foundNew) {
      foundNew = false;
      for (const row of rows) {
        if (!excludedIds.has(row.cat_id) && excludedIds.has(row.master_cat_id)) {
          excludedIds.add(row.cat_id);
          foundNew = true;
        }
      }
    }

    this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);

    const categories = [];
    for (const row of rows) {
      if (excludedIds.has(row.cat_id)) continue;

      const pathParts = this._buildPathParts(row, byId);
      categories.push({
        id: row.cat_id,
        name: row.name,
        parentId: row.master_cat_id,
        type: row.type,
        fullPath: pathParts.join(' > '),
        embeddingText: pathParts.join(' ')
      });
    }

    return categories;
  }

  /**
   * Build theme objects from raw rows (no exclusions).
   */
  _buildThemes(rows) {
    const byId = new Map(rows.map(r => [r.cat_id, r]));

    return rows.map(row => {
      const pathParts = this._buildPathParts(row, byId);
      return {
        id: row.cat_id,
        name: row.name,
        parentId: row.master_cat_id,
        type: row.type,
        fullPath: pathParts.join(' > '),
        embeddingText: pathParts.join(' ')
      };
    });
  }

  /**
   * Build color objects from raw rows; the color name is the embedding text.
   */
  _buildColors(rows) {
    return rows.map(row => ({
      id: row.color,
      name: row.name,
      hexColor: row.hex_color,
      embeddingText: row.name
    }));
  }

  /**
   * Attach an `embedding` to a copy of each item, using the provider's chunked
   * batch API. The input array and its items are not mutated.
   *
   * @param {Array<{embeddingText: string}>} items
   * @param {string} label - used only in log messages
   * @returns {Promise<Array>} new array of item copies with `embedding` set;
   *   the same (empty) array when there is nothing to embed
   */
  async _generateEmbeddings(items, label) {
    if (items.length === 0) {
      return items;
    }

    const startTime = Date.now();
    const texts = items.map(item => item.embeddingText);
    const results = [...items];

    // Process in batches; chunk.startIndex maps each chunk back to its position in `texts`.
    for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
      for (let i = 0; i < chunk.embeddings.length; i++) {
        const globalIndex = chunk.startIndex + i;
        results[globalIndex] = {
          ...results[globalIndex],
          embedding: chunk.embeddings[i]
        };
      }
    }

    // Surface gaps instead of passing embedding-less items along unnoticed.
    const missing = results.filter(item => !item.embedding).length;
    if (missing > 0) {
      this.logger.warn(`[TaxonomyEmbeddings] ${missing} of ${items.length} ${label} received no embedding from the provider`);
    }

    const elapsed = Date.now() - startTime;
    this.logger.info(`[TaxonomyEmbeddings] Generated ${items.length} ${label} embeddings in ${elapsed}ms`);

    return results;
  }

  // ============================================================================
  // Disk Cache Methods
  // ============================================================================

  /**
   * Read and validate the disk cache.
   *
   * @returns {Object|null} parsed cache, or null when the file is missing,
   *   unreadable, or structurally invalid
   */
  _loadCache() {
    try {
      if (!fs.existsSync(CACHE_PATH)) return null;

      const data = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));

      // Validate structure only. Empty arrays are legitimate (e.g. a taxonomy
      // with no themes); staleness is decided by the caller's hash comparison.
      const wellFormed = Boolean(data?.contentHash)
        && Array.isArray(data.categories)
        && Array.isArray(data.themes)
        && Array.isArray(data.colors);

      if (!wellFormed) {
        this.logger.warn('[TaxonomyEmbeddings] Disk cache malformed or missing content hash, will regenerate');
        return null;
      }

      return data;
    } catch (err) {
      this.logger.warn('[TaxonomyEmbeddings] Failed to load disk cache:', err.message);
      return null;
    }
  }

  /**
   * Persist the current in-memory taxonomy to disk. Best effort: a failure is
   * logged and otherwise ignored.
   *
   * The payload is written to a temporary file in the same directory and then
   * renamed over the cache path, so an interrupted write does not leave a
   * truncated cache file behind.
   */
  _saveCache() {
    const tempPath = `${CACHE_PATH}.${process.pid}.tmp`;

    try {
      fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
      fs.writeFileSync(tempPath, JSON.stringify({
        generatedAt: new Date().toISOString(),
        contentHash: this.contentHash,
        categories: this.categories,
        themes: this.themes,
        colors: this.colors,
      }));
      fs.renameSync(tempPath, CACHE_PATH);
      this.logger.info(`[TaxonomyEmbeddings] Disk cache saved to ${CACHE_PATH}`);
    } catch (err) {
      try {
        fs.unlinkSync(tempPath);
      } catch (cleanupErr) {
        // The temp file may never have been created; nothing to clean up.
      }
      this.logger.warn('[TaxonomyEmbeddings] Failed to save disk cache:', err.message);
    }
  }
}

module.exports = { TaxonomyEmbeddings };