Add category suggestions to product editor, deal with taxonomy embeddings better, fix category badge overflow

This commit is contained in:
2026-03-18 12:40:25 -04:00
parent 39b8faa208
commit 1b836567cd
9 changed files with 457 additions and 109 deletions

View File

@@ -51,6 +51,10 @@ async function ensureInitialized() {
...result.stats,
groqEnabled: result.groqEnabled
});
// Watch for taxonomy changes in the background (checks every hour)
aiService.startBackgroundCheck(getDbConnection);
return true;
} catch (error) {
console.error('[AI Routes] Failed to initialize AI service:', error);
@@ -431,4 +435,16 @@ router.post('/validate/sanity-check', async (req, res) => {
}
});
/**
 * Fire-and-forget warm-up of the AI service.
 *
 * Delegates to ensureInitialized() and deliberately does not await it, so the
 * caller is never blocked. A rejected initialization is logged and otherwise
 * ignored; nothing is returned.
 */
function initInBackground() {
  const reportFailure = (err) => {
    console.error('[AI Routes] Background initialization failed:', err);
  };

  ensureInitialized().catch(reportFailure);
}
// The router is the module's export; initInBackground is attached to it as an
// extra property so importers of the router can trigger the warm-up as well.
module.exports = router;
module.exports.initInBackground = initInBackground;

View File

@@ -162,6 +162,8 @@ async function startServer() {
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
console.log(`[Server] Running in ${process.env.NODE_ENV || 'development'} mode on port ${PORT}`);
// Pre-warm AI service so taxonomy embeddings are ready before first user request
aiRouter.initInBackground();
});
} catch (error) {
console.error('Failed to start server:', error);

View File

@@ -3,13 +3,26 @@
*
* Generates and caches embeddings for categories, themes, and colors.
* Excludes "Black Friday", "Gifts", "Deals" categories and their children.
*
* Disk cache: embeddings are saved to data/taxonomy-embeddings.json and reused
* across server restarts. Cache is invalidated by content hash — if the taxonomy
* rows in MySQL change, the next check will detect it and regenerate automatically.
*
* Background check: after initialization, call startBackgroundCheck(getConnectionFn)
* to poll for taxonomy changes on a configurable interval (default 1h).
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { findTopMatches } = require('./similarity');
// Categories to exclude (and all their children)
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];
// Disk cache config
const CACHE_PATH = path.join(__dirname, '..', '..', '..', '..', 'data', 'taxonomy-embeddings.json');
class TaxonomyEmbeddings {
constructor({ provider, logger }) {
this.provider = provider;
@@ -25,12 +38,18 @@ class TaxonomyEmbeddings {
this.themeMap = new Map();
this.colorMap = new Map();
// Content hash of the last successfully built taxonomy (from DB rows)
this.contentHash = null;
this.initialized = false;
this.initializing = false;
this._checkInterval = null;
this._regenerating = false;
}
/**
* Initialize embeddings - fetch taxonomy and generate embeddings
* Initialize embeddings: fetches raw taxonomy rows to compute a content hash,
* then either loads the matching disk cache or generates fresh embeddings.
*/
async initialize(connection) {
if (this.initialized) {
@@ -48,42 +67,36 @@ class TaxonomyEmbeddings {
this.initializing = true;
try {
this.logger.info('[TaxonomyEmbeddings] Starting initialization...');
// Always fetch raw rows first — cheap (~10ms), no OpenAI calls.
// Used to compute a content hash for cache validation.
const rawRows = await this._fetchRawRows(connection);
const freshHash = this._computeContentHash(rawRows);
// Fetch raw taxonomy data
const [categories, themes, colors] = await Promise.all([
this._fetchCategories(connection),
this._fetchThemes(connection),
this._fetchColors(connection)
]);
const cached = this._loadCache();
if (cached && cached.contentHash === freshHash) {
this.categories = cached.categories;
this.themes = cached.themes;
this.colors = cached.colors;
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
this.contentHash = freshHash;
this.initialized = true;
this.logger.info(`[TaxonomyEmbeddings] Loaded from cache: ${this.categories.length} categories, ${this.themes.length} themes, ${this.colors.length} colors`);
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
}
this.logger.info(`[TaxonomyEmbeddings] Fetched ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
// Generate embeddings in parallel
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
this._generateEmbeddings(categories, 'categories'),
this._generateEmbeddings(themes, 'themes'),
this._generateEmbeddings(colors, 'colors')
]);
// Store with embeddings
this.categories = catEmbeddings;
this.themes = themeEmbeddings;
this.colors = colorEmbeddings;
// Build lookup maps
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
if (cached) {
this.logger.info('[TaxonomyEmbeddings] Taxonomy changed since cache was built, regenerating...');
} else {
this.logger.info('[TaxonomyEmbeddings] No cache — fetching taxonomy and generating embeddings...');
}
await this._buildAndEmbed(rawRows, freshHash);
this.initialized = true;
this.logger.info('[TaxonomyEmbeddings] Initialization complete');
return {
categories: this.categories.length,
themes: this.themes.length,
colors: this.colors.length
};
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
} catch (error) {
this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
throw error;
@@ -92,6 +105,47 @@ class TaxonomyEmbeddings {
}
}
/**
* Start a background interval that checks for taxonomy changes and regenerates
* embeddings automatically if the content hash differs.
*
* @param {Function} getConnectionFn - async function returning { connection }
* @param {number} intervalMs - check interval, default 1 hour
*/
startBackgroundCheck(getConnectionFn, intervalMs = 60 * 60 * 1000) {
if (this._checkInterval) return;
this.logger.info(`[TaxonomyEmbeddings] Background taxonomy check started (every ${intervalMs / 60000} min)`);
this._checkInterval = setInterval(async () => {
if (this._regenerating) return;
try {
const { connection } = await getConnectionFn();
const rawRows = await this._fetchRawRows(connection);
const freshHash = this._computeContentHash(rawRows);
if (freshHash === this.contentHash) return;
this.logger.info('[TaxonomyEmbeddings] Taxonomy changed, regenerating embeddings in background...');
this._regenerating = true;
await this._buildAndEmbed(rawRows, freshHash);
this.logger.info('[TaxonomyEmbeddings] Background regeneration complete');
} catch (err) {
this.logger.warn('[TaxonomyEmbeddings] Background taxonomy check failed:', err.message);
} finally {
this._regenerating = false;
}
}, intervalMs);
}
stopBackgroundCheck() {
if (this._checkInterval) {
clearInterval(this._checkInterval);
this._checkInterval = null;
}
}
/**
* Find similar categories for a product embedding
*/
@@ -176,29 +230,74 @@ class TaxonomyEmbeddings {
// Private Methods
// ============================================================================
async _fetchCategories(connection) {
// Fetch hierarchical categories (types 10-13)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (10, 11, 12, 13)
ORDER BY type, name
`);
/**
* Fetch minimal raw rows from MySQL — used for content hash computation.
* This is the cheap path: no path-building, no embeddings, just the raw data.
*/
async _fetchRawRows(connection) {
const [[catRows], [themeRows], [colorRows]] = await Promise.all([
connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (10, 11, 12, 13) ORDER BY cat_id'),
connection.query('SELECT cat_id, name, master_cat_id, type FROM product_categories WHERE type IN (20, 21) ORDER BY cat_id'),
connection.query('SELECT color, name, hex_color FROM product_color_list ORDER BY `order`')
]);
return { catRows, themeRows, colorRows };
}
// Build lookup for hierarchy
/**
* Compute a stable SHA-256 hash of the taxonomy row content.
* Any change to IDs, names, or parent relationships will produce a different hash.
*/
_computeContentHash({ catRows, themeRows, colorRows }) {
const content = JSON.stringify({
cats: catRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
themes: themeRows.map(r => [r.cat_id, r.name, r.master_cat_id]).sort((a, b) => a[0] - b[0]),
colors: colorRows.map(r => [r.color, r.name]).sort()
});
return crypto.createHash('sha256').update(content).digest('hex').slice(0, 16);
}
/**
* Build full taxonomy objects and generate embeddings, then atomically swap
* the in-memory state. Called on cache miss and on background change detection.
*/
async _buildAndEmbed(rawRows, contentHash) {
const { catRows, themeRows, colorRows } = rawRows;
const categories = this._buildCategories(catRows);
const themes = this._buildThemes(themeRows);
const colors = this._buildColors(colorRows);
this.logger.info(`[TaxonomyEmbeddings] Generating embeddings for ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
this._generateEmbeddings(categories, 'categories'),
this._generateEmbeddings(themes, 'themes'),
this._generateEmbeddings(colors, 'colors')
]);
// Atomic in-memory swap (single-threaded JS — readers always see a consistent state)
this.categories = catEmbeddings;
this.themes = themeEmbeddings;
this.colors = colorEmbeddings;
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
this.contentHash = contentHash;
this._saveCache();
}
_buildCategories(rows) {
const byId = new Map(rows.map(r => [r.cat_id, r]));
// Find IDs of excluded top-level categories and all their descendants
const excludedIds = new Set();
// First pass: find excluded top-level categories
for (const row of rows) {
if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(row.name.toLowerCase())) {
excludedIds.add(row.cat_id);
}
}
// Multiple passes to find all descendants
// Multiple passes to find all descendants of excluded categories
let foundNew = true;
while (foundNew) {
foundNew = false;
@@ -212,20 +311,14 @@ class TaxonomyEmbeddings {
this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);
// Build category objects with full paths, excluding filtered ones
const categories = [];
for (const row of rows) {
if (excludedIds.has(row.cat_id)) {
continue;
}
if (excludedIds.has(row.cat_id)) continue;
const path = [];
const pathParts = [];
let current = row;
// Walk up the tree to build full path
while (current) {
path.unshift(current.name);
pathParts.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
@@ -234,55 +327,37 @@ class TaxonomyEmbeddings {
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
fullPath: pathParts.join(' > '),
embeddingText: pathParts.join(' ')
});
}
return categories;
}
async _fetchThemes(connection) {
// Fetch themes (types 20-21)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (20, 21)
ORDER BY type, name
`);
_buildThemes(rows) {
const byId = new Map(rows.map(r => [r.cat_id, r]));
const themes = [];
for (const row of rows) {
const path = [];
return rows.map(row => {
const pathParts = [];
let current = row;
while (current) {
path.unshift(current.name);
pathParts.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
themes.push({
return {
id: row.cat_id,
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
});
}
return themes;
fullPath: pathParts.join(' > '),
embeddingText: pathParts.join(' ')
};
});
}
async _fetchColors(connection) {
const [rows] = await connection.query(`
SELECT color, name, hex_color
FROM product_color_list
ORDER BY \`order\`
`);
_buildColors(rows) {
return rows.map(row => ({
id: row.color,
name: row.name,
@@ -301,9 +376,7 @@ class TaxonomyEmbeddings {
const results = [...items];
// Process in batches
let batchNum = 0;
for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
batchNum++;
for (let i = 0; i < chunk.embeddings.length; i++) {
const globalIndex = chunk.startIndex + i;
results[globalIndex] = {
@@ -318,6 +391,43 @@ class TaxonomyEmbeddings {
return results;
}
// ============================================================================
// Disk Cache Methods
// ============================================================================
_loadCache() {
try {
if (!fs.existsSync(CACHE_PATH)) return null;
const data = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
if (!data.contentHash || !data.categories?.length || !data.themes?.length || !data.colors?.length) {
this.logger.warn('[TaxonomyEmbeddings] Disk cache malformed or missing content hash, will regenerate');
return null;
}
return data;
} catch (err) {
this.logger.warn('[TaxonomyEmbeddings] Failed to load disk cache:', err.message);
return null;
}
}
_saveCache() {
try {
fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
fs.writeFileSync(CACHE_PATH, JSON.stringify({
generatedAt: new Date().toISOString(),
contentHash: this.contentHash,
categories: this.categories,
themes: this.themes,
colors: this.colors,
}));
this.logger.info(`[TaxonomyEmbeddings] Disk cache saved to ${CACHE_PATH}`);
} catch (err) {
this.logger.warn('[TaxonomyEmbeddings] Failed to save disk cache:', err.message);
}
}
}
module.exports = { TaxonomyEmbeddings };

View File

@@ -124,6 +124,17 @@ function isReady() {
return initialized && taxonomyEmbeddings?.isReady();
}
/**
 * Forward to the taxonomy embeddings' background change detection.
 *
 * Silently does nothing unless the service is flagged as initialized and a
 * taxonomy embeddings instance exists. Both arguments are passed through as-is.
 *
 * @param {Function} getConnectionFn - function that returns { connection }
 * @param {number} [intervalMs] - default 1 hour
 */
function startBackgroundCheck(getConnectionFn, intervalMs) {
  const canStart = initialized && taxonomyEmbeddings;
  if (canStart) {
    taxonomyEmbeddings.startBackgroundCheck(getConnectionFn, intervalMs);
  }
}
/**
* Build weighted product text for embedding.
* Weights the product name heavily by repeating it, and truncates long descriptions
@@ -362,6 +373,7 @@ module.exports = {
initialize,
isReady,
getStatus,
startBackgroundCheck,
// Embeddings (OpenAI)
getProductEmbedding,