Files
inventory/inventory-server/src/services/ai/embeddings/taxonomyEmbeddings.js

434 lines
14 KiB
JavaScript

/**
* Taxonomy Embedding Service
*
* Generates and caches embeddings for categories, themes, and colors.
* Excludes "Black Friday", "Gifts", "Deals" categories and their children.
*
* Disk cache: embeddings are saved to data/taxonomy-embeddings.json and reused
* across server restarts. Cache is invalidated by content hash — if the taxonomy
* rows in MySQL change, the next check will detect it and regenerate automatically.
*
* Background check: after initialization, call startBackgroundCheck(getConnectionFn)
* to poll for taxonomy changes on a configurable interval (default 1h).
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { findTopMatches } = require('./similarity');
// Lower-cased names of type-10 categories to exclude; every descendant of a
// matching category is excluded as well (see _buildCategories).
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];
// Disk cache location: data/taxonomy-embeddings.json, resolved four directories
// above the directory containing this file.
const CACHE_PATH = path.join(__dirname, '..', '..', '..', '..', 'data', 'taxonomy-embeddings.json');
/**
 * Holds category, theme and color taxonomy items together with their
 * embeddings, and answers "which taxonomy entries are closest to this product
 * embedding" queries.
 *
 * State is built either from the disk cache (when its content hash matches the
 * current database rows) or by generating fresh embeddings via the provider.
 */
class TaxonomyEmbeddings {
  /**
   * @param {Object} options
   * @param {Object} options.provider - Embedding provider. Must expose
   *   `embedBatchChunked(texts, { batchSize })` as an async iterable yielding
   *   `{ startIndex, embeddings }` chunks.
   * @param {Object} [options.logger=console] - Logger with `info`, `warn` and
   *   `error` methods.
   */
  constructor({ provider, logger }) {
    this.provider = provider;
    this.logger = logger || console;

    // Taxonomy items with their embeddings attached
    this.categories = [];
    this.themes = [];
    this.colors = [];

    // id -> item lookups, rebuilt every time the arrays above are replaced
    this.categoryMap = new Map();
    this.themeMap = new Map();
    this.colorMap = new Map();

    // Content hash of the last successfully built taxonomy (from DB rows)
    this.contentHash = null;

    this.initialized = false;
    this.initializing = false;
    // In-flight initialization shared by concurrent initialize() callers
    this._initPromise = null;
    this._checkInterval = null;
    this._regenerating = false;
  }

  /**
   * Initialize embeddings: fetch the raw taxonomy rows, hash them, then either
   * load the matching disk cache or generate fresh embeddings.
   *
   * Concurrent calls share one in-flight initialization, so they all resolve
   * with the same counts or all reject with the same error. After a failure the
   * next call starts a new attempt.
   *
   * @param {Object} connection - DB connection exposing `query(sql)`.
   * @returns {Promise<{categories: number, themes: number, colors: number}>}
   * @throws Rethrows whatever error made initialization fail.
   */
  async initialize(connection) {
    if (this.initialized) {
      return this._counts();
    }
    if (this._initPromise) {
      return this._initPromise;
    }

    this.initializing = true;
    this._initPromise = this._runInitialization(connection).finally(() => {
      this.initializing = false;
      this._initPromise = null;
    });
    return this._initPromise;
  }

  /**
   * Start a background interval that checks for taxonomy changes and
   * regenerates embeddings when the content hash differs. Calling this while a
   * check interval is already running is a no-op.
   *
   * @param {Function} getConnectionFn - async function returning { connection }
   * @param {number} intervalMs - check interval, default 1 hour
   */
  startBackgroundCheck(getConnectionFn, intervalMs = 60 * 60 * 1000) {
    if (this._checkInterval) return;

    this.logger.info(`[TaxonomyEmbeddings] Background taxonomy check started (every ${intervalMs / 60000} min)`);
    this._checkInterval = setInterval(() => {
      // Failures are caught and logged inside _runBackgroundCheck.
      void this._runBackgroundCheck(getConnectionFn);
    }, intervalMs);
  }

  /**
   * Stop the background check interval, if one is running.
   */
  stopBackgroundCheck() {
    if (this._checkInterval) {
      clearInterval(this._checkInterval);
      this._checkInterval = null;
    }
  }

  /**
   * Find similar categories for a product embedding.
   *
   * @param {*} productEmbedding - Embedding passed through to findTopMatches.
   * @param {number} [topK=10] - Maximum number of matches requested.
   * @returns {Array<{id: *, name: string, fullPath: string, similarity: *}>}
   *   Empty when the service is not initialized or no embedding is given.
   */
  findSimilarCategories(productEmbedding, topK = 10) {
    return this._findSimilar(productEmbedding, this.categories, this.categoryMap, topK, true);
  }

  /**
   * Find similar themes for a product embedding.
   *
   * @param {*} productEmbedding - Embedding passed through to findTopMatches.
   * @param {number} [topK=5] - Maximum number of matches requested.
   * @returns {Array<{id: *, name: string, fullPath: string, similarity: *}>}
   *   Empty when the service is not initialized or no embedding is given.
   */
  findSimilarThemes(productEmbedding, topK = 5) {
    return this._findSimilar(productEmbedding, this.themes, this.themeMap, topK, true);
  }

  /**
   * Find similar colors for a product embedding.
   *
   * @param {*} productEmbedding - Embedding passed through to findTopMatches.
   * @param {number} [topK=5] - Maximum number of matches requested.
   * @returns {Array<{id: *, name: string, similarity: *}>}
   *   Empty when the service is not initialized or no embedding is given.
   */
  findSimilarColors(productEmbedding, topK = 5) {
    return this._findSimilar(productEmbedding, this.colors, this.colorMap, topK, false);
  }

  /**
   * Get all taxonomy data (without embeddings) for frontend.
   *
   * @returns {{categories: Array, themes: Array, colors: Array}}
   */
  getTaxonomyData() {
    const withoutEmbedding = ({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId });
    return {
      categories: this.categories.map(withoutEmbedding),
      themes: this.themes.map(withoutEmbedding),
      colors: this.colors.map(({ id, name }) => ({ id, name }))
    };
  }

  /**
   * Check if service is ready.
   *
   * @returns {boolean} true once initialize() has completed successfully.
   */
  isReady() {
    return this.initialized;
  }

  // ============================================================================
  // Private Methods
  // ============================================================================

  /**
   * Current item counts, in the shape returned by initialize().
   */
  _counts() {
    return {
      categories: this.categories.length,
      themes: this.themes.length,
      colors: this.colors.length
    };
  }

  /**
   * Body of initialize(): load from a matching disk cache or build from scratch.
   * Logs and rethrows any failure.
   */
  async _runInitialization(connection) {
    try {
      // Always fetch raw rows first: no embedding-provider calls are involved,
      // and the rows are needed to compute the hash that validates the cache.
      const rawRows = await this._fetchRawRows(connection);
      const freshHash = this._computeContentHash(rawRows);
      const cached = this._loadCache();

      if (cached && cached.contentHash === freshHash) {
        this._applyState(cached, freshHash);
        this.initialized = true;
        this.logger.info(`[TaxonomyEmbeddings] Loaded from cache: ${this.categories.length} categories, ${this.themes.length} themes, ${this.colors.length} colors`);
        return this._counts();
      }

      if (cached) {
        this.logger.info('[TaxonomyEmbeddings] Taxonomy changed since cache was built, regenerating...');
      } else {
        this.logger.info('[TaxonomyEmbeddings] No cache — fetching taxonomy and generating embeddings...');
      }

      await this._buildAndEmbed(rawRows, freshHash);
      this.initialized = true;
      this.logger.info('[TaxonomyEmbeddings] Initialization complete');
      return this._counts();
    } catch (error) {
      this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
      throw error;
    }
  }

  /**
   * One background tick: re-hash the DB rows and regenerate embeddings if the
   * hash changed. Never rejects; failures are logged as warnings.
   */
  async _runBackgroundCheck(getConnectionFn) {
    if (this._regenerating) return;

    // Claim the flag before any awaited work so an overlapping tick cannot
    // start a second check, or clear the flag while this one is still running.
    this._regenerating = true;
    try {
      const { connection } = await getConnectionFn();
      const rawRows = await this._fetchRawRows(connection);
      const freshHash = this._computeContentHash(rawRows);
      if (freshHash === this.contentHash) return;

      this.logger.info('[TaxonomyEmbeddings] Taxonomy changed, regenerating embeddings in background...');
      await this._buildAndEmbed(rawRows, freshHash);
      this.logger.info('[TaxonomyEmbeddings] Background regeneration complete');
    } catch (err) {
      this.logger.warn('[TaxonomyEmbeddings] Background taxonomy check failed:', err.message);
    } finally {
      this._regenerating = false;
    }
  }

  /**
   * Shared implementation of the findSimilar* methods.
   *
   * @param {*} productEmbedding - Query embedding.
   * @param {Array} items - Taxonomy items to search.
   * @param {Map} lookup - id -> item map used to resolve names and paths.
   * @param {number} topK - Maximum number of matches requested.
   * @param {boolean} includePath - Whether to add `fullPath` to each result.
   */
  _findSimilar(productEmbedding, items, lookup, topK, includePath) {
    if (!this.initialized || !productEmbedding) {
      return [];
    }

    return findTopMatches(productEmbedding, items, topK).map(match => {
      const item = lookup.get(match.id);
      const result = { id: match.id, name: item?.name || '' };
      if (includePath) {
        result.fullPath = item?.fullPath || '';
      }
      result.similarity = match.similarity;
      return result;
    });
  }

  /**
   * Replace the in-memory taxonomy and rebuild the id lookups. Runs
   * synchronously, so no other code observes a half-updated state.
   */
  _applyState({ categories, themes, colors }, contentHash) {
    this.categories = categories;
    this.themes = themes;
    this.colors = colors;
    this.categoryMap = new Map(categories.map(c => [c.id, c]));
    this.themeMap = new Map(themes.map(t => [t.id, t]));
    this.colorMap = new Map(colors.map(c => [c.id, c]));
    this.contentHash = contentHash;
  }

  /**
   * Fetch the raw taxonomy rows from the database. No path-building and no
   * embeddings: just the rows needed for hashing and for building items.
   *
   * @returns {Promise<{catRows: Array, themeRows: Array, colorRows: Array}>}
   */
  async _fetchRawRows(connection) {
    const [[catRows], [themeRows], [colorRows]] = await Promise.all([
      connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (10, 11, 12, 13) ORDER BY cat_id'),
      connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (20, 21) ORDER BY cat_id'),
      connection.query('SELECT color, name, hex_color FROM product_color_list ORDER BY `order`')
    ]);
    return { catRows, themeRows, colorRows };
  }

  /**
   * Compute a stable hash of the taxonomy row content: the first 16 hex
   * characters of a SHA-256 digest. Any change to IDs, names, or parent
   * relationships produces a different hash.
   *
   * NOTE: `type` and `hex_color` are not part of the hashed content, so a
   * change to only those columns is not detected. The hash input is kept as is
   * so that existing disk caches remain valid.
   */
  _computeContentHash({ catRows, themeRows, colorRows }) {
    const byNumericId = (a, b) => a[0] - b[0];
    const idNameParent = r => [r.cat_id, r.name, r.master_cat_id];

    const content = JSON.stringify({
      cats: catRows.map(idNameParent).sort(byNumericId),
      themes: themeRows.map(idNameParent).sort(byNumericId),
      // Default sort: pairs are compared by their string form ("color,name")
      colors: colorRows.map(r => [r.color, r.name]).sort()
    });
    return crypto.createHash('sha256').update(content).digest('hex').slice(0, 16);
  }

  /**
   * Build full taxonomy objects and generate embeddings, then swap the
   * in-memory state and persist it to the disk cache. Called on cache miss and
   * on background change detection. If embedding generation fails, the
   * existing in-memory state is left untouched.
   */
  async _buildAndEmbed(rawRows, contentHash) {
    const { catRows, themeRows, colorRows } = rawRows;
    const categories = this._buildCategories(catRows);
    const themes = this._buildThemes(themeRows);
    const colors = this._buildColors(colorRows);

    this.logger.info(`[TaxonomyEmbeddings] Generating embeddings for ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);

    const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
      this._generateEmbeddings(categories, 'categories'),
      this._generateEmbeddings(themes, 'themes'),
      this._generateEmbeddings(colors, 'colors')
    ]);

    this._applyState(
      { categories: catEmbeddings, themes: themeEmbeddings, colors: colorEmbeddings },
      contentHash
    );
    this._saveCache();
  }

  /**
   * Walk from `row` up through its parents and return the names root-first.
   *
   * The walk stops at a missing or falsy parent id. It also stops when it
   * reaches a row it has already visited, so a self-parented row or a parent
   * cycle in the data terminates instead of looping forever.
   */
  _buildPathParts(row, byId) {
    const parts = [];
    const visited = new Set();
    let current = row;
    while (current && !visited.has(current.cat_id)) {
      visited.add(current.cat_id);
      parts.unshift(current.name);
      current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
    }
    return parts;
  }

  /**
   * Convert a category/theme row into a taxonomy item carrying its display
   * path and the text that will be embedded.
   */
  _toPathItem(row, byId) {
    const pathParts = this._buildPathParts(row, byId);
    return {
      id: row.cat_id,
      name: row.name,
      parentId: row.master_cat_id,
      type: row.type,
      fullPath: pathParts.join(' > '),
      embeddingText: pathParts.join(' ')
    };
  }

  /**
   * Build category items, dropping type-10 categories whose name is listed in
   * EXCLUDED_CATEGORY_NAMES together with all of their descendants.
   */
  _buildCategories(rows) {
    const byId = new Map(rows.map(r => [r.cat_id, r]));

    const excludedIds = new Set();
    for (const row of rows) {
      if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(row.name.toLowerCase())) {
        excludedIds.add(row.cat_id);
      }
    }

    // Repeat until a full pass adds nothing, so descendants at any depth are
    // found regardless of row order.
    let foundNew = true;
    while (foundNew) {
      foundNew = false;
      for (const row of rows) {
        if (!excludedIds.has(row.cat_id) && excludedIds.has(row.master_cat_id)) {
          excludedIds.add(row.cat_id);
          foundNew = true;
        }
      }
    }

    this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);

    return rows
      .filter(row => !excludedIds.has(row.cat_id))
      .map(row => this._toPathItem(row, byId));
  }

  /**
   * Build theme items; unlike categories, no themes are excluded.
   */
  _buildThemes(rows) {
    const byId = new Map(rows.map(r => [r.cat_id, r]));
    return rows.map(row => this._toPathItem(row, byId));
  }

  /**
   * Build color items; the color name is the text that gets embedded.
   */
  _buildColors(rows) {
    return rows.map(row => ({
      id: row.color,
      name: row.name,
      hexColor: row.hex_color,
      embeddingText: row.name
    }));
  }

  /**
   * Attach an `embedding` to each item using the provider's chunked batch API.
   *
   * @param {Array<{embeddingText: string}>} items - Items to embed.
   * @param {string} label - Item kind, used only in the log line.
   * @returns {Promise<Array>} Copies of the items with `embedding` set. An
   *   empty input is returned as is, without calling the provider.
   */
  async _generateEmbeddings(items, label) {
    if (items.length === 0) {
      return items;
    }

    const startTime = Date.now();
    const texts = items.map(item => item.embeddingText);
    const results = [...items];

    // Each chunk reports where it starts in `texts`, which maps its embeddings
    // back onto the right items.
    for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
      for (let i = 0; i < chunk.embeddings.length; i++) {
        const globalIndex = chunk.startIndex + i;
        results[globalIndex] = {
          ...results[globalIndex],
          embedding: chunk.embeddings[i]
        };
      }
    }

    const elapsed = Date.now() - startTime;
    this.logger.info(`[TaxonomyEmbeddings] Generated ${items.length} ${label} embeddings in ${elapsed}ms`);
    return results;
  }

  // ============================================================================
  // Disk Cache Methods
  // ============================================================================

  /**
   * Read the disk cache.
   *
   * @returns {Object|null} The parsed cache, or null when the file is missing,
   *   unreadable, not valid JSON, or structurally invalid.
   */
  _loadCache() {
    try {
      if (!fs.existsSync(CACHE_PATH)) return null;

      const data = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      // Empty lists are valid (e.g. a taxonomy with no themes); only a missing
      // hash or a non-array list marks the file as malformed.
      const hasAllLists = [data?.categories, data?.themes, data?.colors].every(Array.isArray);
      if (!data?.contentHash || !hasAllLists) {
        this.logger.warn('[TaxonomyEmbeddings] Disk cache malformed or missing content hash, will regenerate');
        return null;
      }
      return data;
    } catch (err) {
      this.logger.warn('[TaxonomyEmbeddings] Failed to load disk cache:', err.message);
      return null;
    }
  }

  /**
   * Write the current in-memory state to the disk cache. Best effort: a
   * failure is logged as a warning and not thrown.
   */
  _saveCache() {
    try {
      fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
      fs.writeFileSync(CACHE_PATH, JSON.stringify({
        generatedAt: new Date().toISOString(),
        contentHash: this.contentHash,
        categories: this.categories,
        themes: this.themes,
        colors: this.colors,
      }));
      this.logger.info(`[TaxonomyEmbeddings] Disk cache saved to ${CACHE_PATH}`);
    } catch (err) {
      this.logger.warn('[TaxonomyEmbeddings] Failed to save disk cache:', err.message);
    }
  }
}
// Export the class itself; this module does not create an instance.
module.exports = { TaxonomyEmbeddings };