Add AI embeddings and category suggestions, plus a few validation-step tweaks and fixes

2026-01-19 11:34:55 -05:00
parent 9ce84fe5b9
commit 43d76e011d
20 changed files with 5311 additions and 176 deletions


@@ -0,0 +1,283 @@
#!/usr/bin/env node
/**
* Embedding Proof-of-Concept Script
*
* Demonstrates how category embeddings work for product matching.
* Uses OpenAI text-embedding-3-small model.
*
* Usage: node scripts/embedding-poc.js
*/
const path = require('path');
require('dotenv').config({ path: path.join(__dirname, '../.env') });
const { getDbConnection, closeAllConnections } = require('../src/utils/dbConnection');
// ============================================================================
// Configuration
// ============================================================================
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const EMBEDDING_MODEL = 'text-embedding-3-small';
const EMBEDDING_DIMENSIONS = 1536;
// Sample products to test (you can modify these)
const TEST_PRODUCTS = [
{
name: "Cosmos Infinity Chipboard - Stamperia",
description: "Laser-cut chipboard shapes featuring celestial designs for mixed media projects"
},
{
name: "Distress Oxide Ink Pad - Mermaid Lagoon",
description: "Water-reactive dye ink that creates an oxidized effect"
},
{
name: "Hedwig Puffy Stickers - Paper House Productions",
description: "3D puffy stickers featuring Harry Potter's owl Hedwig"
},
{
name: "Black Velvet Watercolor Brush Size 6",
description: "Round brush for watercolor painting with synthetic bristles"
},
{
name: "Floral Washi Tape Set",
description: "Decorative paper tape with flower patterns, pack of 6 rolls"
}
];
// ============================================================================
// OpenAI Embedding Functions
// ============================================================================
async function getEmbeddings(texts) {
const response = await fetch('https://api.openai.com/v1/embeddings', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${OPENAI_API_KEY}`
},
body: JSON.stringify({
input: texts.map(t => t.substring(0, 8000)), // Max 8k chars per text
model: EMBEDDING_MODEL,
dimensions: EMBEDDING_DIMENSIONS
})
});
if (!response.ok) {
const error = await response.json();
throw new Error(`OpenAI API error: ${error.error?.message || response.status}`);
}
const data = await response.json();
// Sort by index to ensure order matches input
const sorted = data.data.sort((a, b) => a.index - b.index);
return {
embeddings: sorted.map(item => item.embedding),
usage: data.usage,
model: data.model
};
}
// ============================================================================
// Vector Math
// ============================================================================
function cosineSimilarity(a, b) {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
function findTopMatches(queryEmbedding, categoryEmbeddings, topK = 10) {
const scored = categoryEmbeddings.map(cat => ({
...cat,
similarity: cosineSimilarity(queryEmbedding, cat.embedding)
}));
scored.sort((a, b) => b.similarity - a.similarity);
return scored.slice(0, topK);
}
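// Worked example (illustrative values only): cosine similarity compares direction, not magnitude.
//   cosineSimilarity([1, 2], [2, 4])  === 1   (parallel vectors, same direction)
//   cosineSimilarity([1, 0], [0, 1])  === 0   (orthogonal vectors, unrelated)
//   cosineSimilarity([1, 0], [-1, 0]) === -1  (opposite direction)
// OpenAI embeddings are normalized to unit length, so the dot product alone would give the same ranking.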
// ============================================================================
// Database Functions
// ============================================================================
async function fetchCategories(connection) {
console.log('\n📂 Fetching categories from database...');
// Fetch hierarchical categories (types 10-13)
const [rows] = await connection.query(`
SELECT
cat_id,
name,
master_cat_id,
type
FROM product_categories
WHERE type IN (10, 11, 12, 13)
ORDER BY type, name
`);
console.log(` Found ${rows.length} category records`);
// Build category paths
const byId = new Map(rows.map(r => [r.cat_id, r]));
const categories = [];
for (const row of rows) {
const path = [];
let current = row;
// Walk up the tree to build full path
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
categories.push({
id: row.cat_id,
name: row.name,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ') // For embedding generation
});
}
// Count by level
const levels = {
10: categories.filter(c => c.type === 10).length,
11: categories.filter(c => c.type === 11).length,
12: categories.filter(c => c.type === 12).length,
13: categories.filter(c => c.type === 13).length,
};
console.log(` Level breakdown: ${levels[10]} top-level, ${levels[11]} L2, ${levels[12]} L3, ${levels[13]} L4`);
return categories;
}
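// Illustrative example (hypothetical rows): given
//   { cat_id: 1, name: 'Scrapbooking',    master_cat_id: null, type: 10 }
//   { cat_id: 2, name: 'Embellishments',  master_cat_id: 1,    type: 11 }
//   { cat_id: 3, name: 'Stickers',        master_cat_id: 2,    type: 12 }
// walking up the tree for cat_id 3 produces:
//   fullPath:      'Scrapbooking > Embellishments > Stickers'
//   embeddingText: 'Scrapbooking Embellishments Stickers'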
// ============================================================================
// Main Script
// ============================================================================
async function main() {
console.log('═══════════════════════════════════════════════════════════════');
console.log(' EMBEDDING PROOF-OF-CONCEPT');
console.log(' Model: ' + EMBEDDING_MODEL);
console.log('═══════════════════════════════════════════════════════════════');
if (!OPENAI_API_KEY) {
console.error('❌ OPENAI_API_KEY not found in environment');
process.exit(1);
}
let connection;
try {
// Step 1: Connect to database
console.log('\n🔌 Connecting to database via SSH tunnel...');
const { connection: conn } = await getDbConnection();
connection = conn;
console.log(' ✅ Connected');
// Step 2: Fetch categories
const categories = await fetchCategories(connection);
// Step 3: Generate embeddings for categories
console.log('\n🧮 Generating embeddings for categories...');
console.log(' Estimated cost: ~$' + (categories.length * 10 * 0.02 / 1e6).toFixed(6) + ' (assuming ~10 tokens per category path at $0.02 per 1M tokens)');
const startTime = Date.now();
// Process in batches of 100 (OpenAI limit is 2048)
const BATCH_SIZE = 100;
let totalTokens = 0;
for (let i = 0; i < categories.length; i += BATCH_SIZE) {
const batch = categories.slice(i, i + BATCH_SIZE);
const texts = batch.map(c => c.embeddingText);
const result = await getEmbeddings(texts);
// Attach embeddings to categories
for (let j = 0; j < batch.length; j++) {
batch[j].embedding = result.embeddings[j];
}
totalTokens += result.usage.total_tokens;
console.log(` Batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(categories.length / BATCH_SIZE)}: ${batch.length} categories embedded`);
}
const embeddingTime = Date.now() - startTime;
console.log(` ✅ Generated ${categories.length} embeddings in ${embeddingTime}ms`);
console.log(` 📊 Total tokens used: ${totalTokens} (~$${(totalTokens * 0.02 / 1e6).toFixed(6)} at $0.02 per 1M tokens)`);
// Step 4: Test with sample products
console.log('\n═══════════════════════════════════════════════════════════════');
console.log(' TESTING WITH SAMPLE PRODUCTS');
console.log('═══════════════════════════════════════════════════════════════');
for (const product of TEST_PRODUCTS) {
console.log('\n┌─────────────────────────────────────────────────────────────');
console.log(`│ Product: "${product.name}"`);
console.log(`│ Description: "${product.description.substring(0, 60)}..."`);
console.log('├─────────────────────────────────────────────────────────────');
// Generate embedding for product
const productText = `${product.name} ${product.description}`;
const { embeddings: [productEmbedding] } = await getEmbeddings([productText]);
// Find top matches
const matches = findTopMatches(productEmbedding, categories, 10);
console.log('│ Top 10 Category Matches:');
matches.forEach((match, i) => {
const similarity = (match.similarity * 100).toFixed(1);
const bar = '█'.repeat(Math.round(match.similarity * 20));
const marker = i < 3 ? ' ✅' : '';
console.log(`│ ${(i + 1).toString().padStart(2)}. [${similarity.padStart(5)}%] ${bar.padEnd(20)} ${match.fullPath}${marker}`);
});
console.log('└─────────────────────────────────────────────────────────────');
}
// Step 5: Summary
console.log('\n═══════════════════════════════════════════════════════════════');
console.log(' SUMMARY');
console.log('═══════════════════════════════════════════════════════════════');
console.log(` Categories embedded: ${categories.length}`);
console.log(` Embedding time: ${embeddingTime}ms (one-time cost)`);
console.log(` Per-product lookup: ~${Math.round((Date.now() - startTime - embeddingTime) / TEST_PRODUCTS.length)}ms (excludes the one-time category embedding)`);
console.log(` Vector dimensions: ${EMBEDDING_DIMENSIONS}`);
console.log(` Memory usage: ~${(categories.length * EMBEDDING_DIMENSIONS * 8 / 1024 / 1024).toFixed(2)} MB (plain JS arrays store 64-bit floats)`);
console.log('');
console.log(' 💡 In production:');
console.log(' - Category embeddings are computed once and cached');
console.log(' - Only product embedding is computed per-request (~$0.00002)');
console.log(' - Vector search is instant (in-memory cosine similarity)');
console.log(' - Top 10 results go to AI for final selection (~$0.0001)');
console.log('═══════════════════════════════════════════════════════════════\n');
} catch (error) {
console.error('\n❌ Error:', error.message);
if (error.stack) {
console.error(error.stack);
}
process.exit(1);
} finally {
await closeAllConnections();
console.log('🔌 Database connections closed');
}
}
// Run the script
main();


@@ -0,0 +1,281 @@
/**
* AI Routes
*
* API endpoints for AI-powered product validation features.
* Provides embedding generation and similarity-based suggestions.
*/
const express = require('express');
const router = express.Router();
const aiService = require('../services/ai');
const { getDbConnection, closeAllConnections } = require('../utils/dbConnection');
// Track initialization state
let initializationPromise = null;
/**
* Ensure AI service is initialized
* Uses lazy initialization on first request
*/
async function ensureInitialized() {
if (aiService.isReady()) {
return true;
}
if (initializationPromise) {
await initializationPromise;
return aiService.isReady();
}
initializationPromise = (async () => {
try {
console.log('[AI Routes] Initializing AI service...');
// Get database connection for taxonomy
const { connection } = await getDbConnection();
const result = await aiService.initialize({
openaiApiKey: process.env.OPENAI_API_KEY,
mysqlConnection: connection,
logger: console
});
if (!result.success) {
console.error('[AI Routes] AI service initialization failed:', result.message);
return false;
}
console.log('[AI Routes] AI service initialized:', result.stats);
return true;
} catch (error) {
console.error('[AI Routes] Failed to initialize AI service:', error);
return false;
}
})();
await initializationPromise;
if (!aiService.isReady()) {
// Clear the cached promise so a later request (or POST /api/ai/initialize) can retry after a failed attempt
initializationPromise = null;
}
return aiService.isReady();
}
/**
* GET /api/ai/status
* Get AI service status
*/
router.get('/status', async (req, res) => {
try {
const status = aiService.getStatus();
res.json(status);
} catch (error) {
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/initialize
* Manually trigger initialization (also happens automatically on first use)
*/
router.post('/initialize', async (req, res) => {
try {
const ready = await ensureInitialized();
const status = aiService.getStatus();
res.json({
success: ready,
...status
});
} catch (error) {
console.error('[AI Routes] Initialize error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/ai/taxonomy
* Get all taxonomy data (categories, themes, colors) without embeddings
*/
router.get('/taxonomy', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const taxonomy = aiService.getTaxonomyData();
res.json(taxonomy);
} catch (error) {
console.error('[AI Routes] Taxonomy error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/embedding
* Generate embedding for a single product
*
* Body: { product: { name, description, company_name, line_name } }
* Returns: { embedding: number[], latencyMs: number }
*/
router.post('/embedding', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { product } = req.body;
if (!product) {
return res.status(400).json({ error: 'Product is required' });
}
const result = await aiService.getProductEmbedding(product);
res.json(result);
} catch (error) {
console.error('[AI Routes] Embedding error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/embeddings
* Generate embeddings for multiple products
*
* Body: { products: Array<{ name, description, company_name, line_name }> }
* Returns: { embeddings: Array<{ index, embedding }>, latencyMs }
*/
router.post('/embeddings', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { products } = req.body;
if (!Array.isArray(products)) {
return res.status(400).json({ error: 'Products array is required' });
}
const result = await aiService.getProductEmbeddings(products);
res.json(result);
} catch (error) {
console.error('[AI Routes] Embeddings error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/suggestions
* Get category/theme/color suggestions for a single product
* Generates embedding and finds similar taxonomy items
*
* Body: { product: { name, description, company_name, line_name }, options?: { topCategories, topThemes, topColors } }
* Returns: { categories: Array, themes: Array, colors: Array, latencyMs }
*/
router.post('/suggestions', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { product, options } = req.body;
if (!product) {
return res.status(400).json({ error: 'Product is required' });
}
const suggestions = await aiService.getSuggestionsForProduct(product, options);
res.json(suggestions);
} catch (error) {
console.error('[AI Routes] Suggestions error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/suggestions/batch
* Get suggestions for multiple products
* More efficient than calling /suggestions multiple times
*
* Body: { products: Array, options?: { topCategories, topThemes, topColors } }
* Returns: { results: Array<{ index, categories, themes, colors }>, latencyMs }
*/
router.post('/suggestions/batch', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { products, options } = req.body;
if (!Array.isArray(products)) {
return res.status(400).json({ error: 'Products array is required' });
}
const startTime = Date.now();
// Generate all embeddings at once
const { embeddings, latencyMs: embeddingLatency } = await aiService.getProductEmbeddings(products);
// Find suggestions for each embedding
const results = embeddings.map(({ index, embedding }) => {
const suggestions = aiService.findSimilarTaxonomy(embedding, options);
return {
index,
...suggestions
};
});
const totalLatency = Date.now() - startTime;
res.json({
results,
latencyMs: totalLatency,
embeddingLatencyMs: embeddingLatency,
searchLatencyMs: totalLatency - embeddingLatency,
productCount: products.length,
embeddingCount: embeddings.length
});
} catch (error) {
console.error('[AI Routes] Batch suggestions error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/similar
* Find similar taxonomy items given a pre-computed embedding
* Useful when frontend has cached the embedding
*
* Body: { embedding: number[], options?: { topCategories, topThemes, topColors } }
* Returns: { categories, themes, colors }
*/
router.post('/similar', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { embedding, options } = req.body;
if (!embedding || !Array.isArray(embedding)) {
return res.status(400).json({ error: 'Embedding array is required' });
}
const startTime = Date.now();
const suggestions = aiService.findSimilarTaxonomy(embedding, options);
res.json({
...suggestions,
latencyMs: Date.now() - startTime
});
} catch (error) {
console.error('[AI Routes] Similar error:', error);
res.status(500).json({ error: error.message });
}
});
module.exports = router;
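A minimal client-side sketch of calling the suggestions endpoint; the localhost base URL and the example product are assumptions for illustration, not part of this commit.
// Hypothetical caller: POST a product to /api/ai/suggestions and log the top category match.
async function fetchSuggestions() {
const response = await fetch('http://localhost:3000/api/ai/suggestions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
product: {
name: 'Floral Washi Tape Set',
description: 'Decorative paper tape with flower patterns, pack of 6 rolls'
},
options: { topCategories: 10, topThemes: 5, topColors: 5 }
})
});
if (!response.ok) throw new Error(`Request failed: ${response.status}`);
const { categories, themes, colors, latencyMs } = await response.json();
console.log(`Got ${categories.length} category suggestions in ${latencyMs}ms`, categories[0]);
}
fetchSuggestions().catch(console.error);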


@@ -15,6 +15,7 @@ const configRouter = require('./routes/config');
const metricsRouter = require('./routes/metrics');
const importRouter = require('./routes/import');
const aiValidationRouter = require('./routes/ai-validation');
const aiRouter = require('./routes/ai');
const templatesRouter = require('./routes/templates');
const aiPromptsRouter = require('./routes/ai-prompts');
const reusableImagesRouter = require('./routes/reusable-images');
@@ -124,6 +125,7 @@ async function startServer() {
app.use('/api/brands-aggregate', brandsAggregateRouter);
app.use('/api/import', importRouter);
app.use('/api/ai-validation', aiValidationRouter);
app.use('/api/ai', aiRouter);
app.use('/api/templates', templatesRouter);
app.use('/api/ai-prompts', aiPromptsRouter);
app.use('/api/reusable-images', reusableImagesRouter);


@@ -0,0 +1,82 @@
/**
* Vector similarity utilities
*/
/**
* Compute cosine similarity between two vectors
* @param {number[]} a
* @param {number[]} b
* @returns {number} Similarity score between -1 and 1
*/
function cosineSimilarity(a, b) {
if (!a || !b || a.length !== b.length) {
return 0;
}
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
if (denominator === 0) return 0;
return dotProduct / denominator;
}
/**
* Find top K most similar items from a collection
* @param {number[]} queryEmbedding - The embedding to search for
* @param {Array<{id: any, embedding: number[]}>} items - Items with embeddings
* @param {number} topK - Number of results to return
* @returns {Array<{id: any, similarity: number}>}
*/
function findTopMatches(queryEmbedding, items, topK = 10) {
if (!queryEmbedding || !items || items.length === 0) {
return [];
}
const scored = items.map(item => ({
id: item.id,
similarity: cosineSimilarity(queryEmbedding, item.embedding)
}));
scored.sort((a, b) => b.similarity - a.similarity);
return scored.slice(0, topK);
}
/**
* Find matches above a similarity threshold
* @param {number[]} queryEmbedding
* @param {Array<{id: any, embedding: number[]}>} items
* @param {number} threshold - Minimum similarity (0-1)
* @returns {Array<{id: any, similarity: number}>}
*/
function findMatchesAboveThreshold(queryEmbedding, items, threshold = 0.5) {
if (!queryEmbedding || !items || items.length === 0) {
return [];
}
const scored = items
.map(item => ({
id: item.id,
similarity: cosineSimilarity(queryEmbedding, item.embedding)
}))
.filter(item => item.similarity >= threshold);
scored.sort((a, b) => b.similarity - a.similarity);
return scored;
}
module.exports = {
cosineSimilarity,
findTopMatches,
findMatchesAboveThreshold
};
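A small self-contained sketch of how these helpers compose, using toy 3-dimensional vectors rather than real 1536-dimensional embeddings (require path assumes a sibling module).
const { cosineSimilarity, findTopMatches, findMatchesAboveThreshold } = require('./similarity');
const query = [0.9, 0.1, 0.0];
const items = [
{ id: 'stickers', embedding: [1.0, 0.0, 0.0] },
{ id: 'ink-pads', embedding: [0.0, 1.0, 0.0] },
{ id: 'brushes', embedding: [0.7, 0.3, 0.0] }
];
console.log(cosineSimilarity(query, items[0].embedding).toFixed(3)); // 0.994, nearly parallel
console.log(findTopMatches(query, items, 2)); // [{ id: 'stickers', ... }, { id: 'brushes', ... }]
console.log(findMatchesAboveThreshold(query, items, 0.5)); // drops 'ink-pads' (similarity ~0.11)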


@@ -0,0 +1,323 @@
/**
* Taxonomy Embedding Service
*
* Generates and caches embeddings for categories, themes, and colors.
* Excludes "Black Friday", "Gifts", "Deals" categories and their children.
*/
const { findTopMatches } = require('./similarity');
// Categories to exclude (and all their children)
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];
class TaxonomyEmbeddings {
constructor({ provider, logger }) {
this.provider = provider;
this.logger = logger || console;
// Cached taxonomy with embeddings
this.categories = [];
this.themes = [];
this.colors = [];
// Raw data without embeddings (for lookup)
this.categoryMap = new Map();
this.themeMap = new Map();
this.colorMap = new Map();
this.initialized = false;
this.initializing = false;
}
/**
* Initialize embeddings - fetch taxonomy and generate embeddings
*/
async initialize(connection) {
if (this.initialized) {
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
}
if (this.initializing) {
// Wait for existing initialization
while (this.initializing) {
await new Promise(resolve => setTimeout(resolve, 100));
}
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
}
this.initializing = true;
try {
this.logger.info('[TaxonomyEmbeddings] Starting initialization...');
// Fetch raw taxonomy data
const [categories, themes, colors] = await Promise.all([
this._fetchCategories(connection),
this._fetchThemes(connection),
this._fetchColors(connection)
]);
this.logger.info(`[TaxonomyEmbeddings] Fetched ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
// Generate embeddings in parallel
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
this._generateEmbeddings(categories, 'categories'),
this._generateEmbeddings(themes, 'themes'),
this._generateEmbeddings(colors, 'colors')
]);
// Store with embeddings
this.categories = catEmbeddings;
this.themes = themeEmbeddings;
this.colors = colorEmbeddings;
// Build lookup maps
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
this.initialized = true;
this.logger.info('[TaxonomyEmbeddings] Initialization complete');
return {
categories: this.categories.length,
themes: this.themes.length,
colors: this.colors.length
};
} catch (error) {
this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
throw error;
} finally {
this.initializing = false;
}
}
/**
* Find similar categories for a product embedding
*/
findSimilarCategories(productEmbedding, topK = 10) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.categories, topK);
return matches.map(match => {
const cat = this.categoryMap.get(match.id);
return {
id: match.id,
name: cat?.name || '',
fullPath: cat?.fullPath || '',
similarity: match.similarity
};
});
}
/**
* Find similar themes for a product embedding
*/
findSimilarThemes(productEmbedding, topK = 5) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.themes, topK);
return matches.map(match => {
const theme = this.themeMap.get(match.id);
return {
id: match.id,
name: theme?.name || '',
fullPath: theme?.fullPath || '',
similarity: match.similarity
};
});
}
/**
* Find similar colors for a product embedding
*/
findSimilarColors(productEmbedding, topK = 5) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.colors, topK);
return matches.map(match => {
const color = this.colorMap.get(match.id);
return {
id: match.id,
name: color?.name || '',
similarity: match.similarity
};
});
}
/**
* Get all taxonomy data (without embeddings) for frontend
*/
getTaxonomyData() {
return {
categories: this.categories.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
themes: this.themes.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
colors: this.colors.map(({ id, name }) => ({ id, name }))
};
}
/**
* Check if service is ready
*/
isReady() {
return this.initialized;
}
// ============================================================================
// Private Methods
// ============================================================================
async _fetchCategories(connection) {
// Fetch hierarchical categories (types 10-13)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (10, 11, 12, 13)
ORDER BY type, name
`);
// Build lookup for hierarchy
const byId = new Map(rows.map(r => [r.cat_id, r]));
// Find IDs of excluded top-level categories and all their descendants
const excludedIds = new Set();
// First pass: find excluded top-level categories
for (const row of rows) {
if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(row.name.toLowerCase())) {
excludedIds.add(row.cat_id);
}
}
// Multiple passes to find all descendants
let foundNew = true;
while (foundNew) {
foundNew = false;
for (const row of rows) {
if (!excludedIds.has(row.cat_id) && excludedIds.has(row.master_cat_id)) {
excludedIds.add(row.cat_id);
foundNew = true;
}
}
}
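// Illustrative example (hypothetical ids): if 'Gifts' is cat_id 50 (type 10), pass 1 adds its
// direct children (rows with master_cat_id 50), pass 2 adds their children, and so on until a
// full pass adds nothing new, so the entire excluded subtree is skipped below.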
this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);
// Build category objects with full paths, excluding filtered ones
const categories = [];
for (const row of rows) {
if (excludedIds.has(row.cat_id)) {
continue;
}
const path = [];
let current = row;
// Walk up the tree to build full path
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
categories.push({
id: row.cat_id,
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
});
}
return categories;
}
async _fetchThemes(connection) {
// Fetch themes (types 20-21)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (20, 21)
ORDER BY type, name
`);
const byId = new Map(rows.map(r => [r.cat_id, r]));
const themes = [];
for (const row of rows) {
const path = [];
let current = row;
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
themes.push({
id: row.cat_id,
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
});
}
return themes;
}
async _fetchColors(connection) {
const [rows] = await connection.query(`
SELECT color, name, hex_color
FROM product_color_list
ORDER BY \`order\`
`);
return rows.map(row => ({
id: row.color,
name: row.name,
hexColor: row.hex_color,
embeddingText: row.name
}));
}
async _generateEmbeddings(items, label) {
if (items.length === 0) {
return items;
}
const startTime = Date.now();
const texts = items.map(item => item.embeddingText);
const results = [...items];
// Process in batches
let batchNum = 0;
for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
batchNum++;
for (let i = 0; i < chunk.embeddings.length; i++) {
const globalIndex = chunk.startIndex + i;
results[globalIndex] = {
...results[globalIndex],
embedding: chunk.embeddings[i]
};
}
}
const elapsed = Date.now() - startTime;
this.logger.info(`[TaxonomyEmbeddings] Generated ${items.length} ${label} embeddings in ${elapsed}ms`);
return results;
}
}
module.exports = { TaxonomyEmbeddings };
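A rough sketch of how this class is wired up, mirroring what the AI service entry point does; the require paths assume the providers/ and embeddings/ layout implied by the imports in this commit, and the MySQL connection is assumed to be opened elsewhere.
const { OpenAIProvider } = require('../providers/openaiProvider');
const { TaxonomyEmbeddings } = require('./taxonomyEmbeddings');
async function example(mysqlConnection) {
const provider = new OpenAIProvider({ apiKey: process.env.OPENAI_API_KEY });
const taxonomy = new TaxonomyEmbeddings({ provider, logger: console });
// One-time cost: fetch taxonomy rows and embed them in batches of 100
const stats = await taxonomy.initialize(mysqlConnection);
console.log('Embedded taxonomy:', stats); // { categories, themes, colors }
// Per-product cost: a single embedding call, then in-memory similarity search
const { embeddings: [productEmbedding] } = await provider.embed('Floral Washi Tape Set');
console.log(taxonomy.findSimilarCategories(productEmbedding, 5));
}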


@@ -0,0 +1,273 @@
/**
* AI Service
*
* Main entry point for AI functionality including embeddings.
* Provides embedding generation and similarity search for product validation.
*/
const { OpenAIProvider } = require('./providers/openaiProvider');
const { TaxonomyEmbeddings } = require('./embeddings/taxonomyEmbeddings');
const { cosineSimilarity, findTopMatches } = require('./embeddings/similarity');
let initialized = false;
let initializing = false;
let openaiProvider = null;
let taxonomyEmbeddings = null;
let logger = console;
/**
* Initialize the AI service
* @param {Object} options
* @param {string} options.openaiApiKey - OpenAI API key
* @param {Object} options.mysqlConnection - MySQL connection for taxonomy data
* @param {Object} [options.logger] - Logger instance
*/
async function initialize({ openaiApiKey, mysqlConnection, logger: customLogger }) {
if (initialized) {
return { success: true, message: 'Already initialized' };
}
if (initializing) {
// Wait for existing initialization
while (initializing) {
await new Promise(resolve => setTimeout(resolve, 100));
}
return { success: initialized, message: initialized ? 'Initialized' : 'Initialization failed' };
}
initializing = true;
try {
if (customLogger) {
logger = customLogger;
}
if (!openaiApiKey) {
throw new Error('OpenAI API key is required');
}
logger.info('[AI] Initializing AI service...');
// Create OpenAI provider
openaiProvider = new OpenAIProvider({ apiKey: openaiApiKey });
// Create and initialize taxonomy embeddings
taxonomyEmbeddings = new TaxonomyEmbeddings({
provider: openaiProvider,
logger
});
const stats = await taxonomyEmbeddings.initialize(mysqlConnection);
initialized = true;
logger.info('[AI] AI service initialized', stats);
return {
success: true,
message: 'Initialized',
stats
};
} catch (error) {
logger.error('[AI] Initialization failed:', error);
return { success: false, message: error.message };
} finally {
initializing = false;
}
}
/**
* Check if service is ready
*/
function isReady() {
return initialized && taxonomyEmbeddings?.isReady();
}
/**
* Build weighted product text for embedding.
* Weights the product name heavily by repeating it, and truncates long descriptions
* to prevent verbose marketing copy from drowning out the product signal.
*
* @param {Object} product - Product with name, description, company, line
* @returns {string} - Combined text for embedding
*/
function buildProductText(product) {
const parts = [];
const name = product.name?.trim();
const description = product.description?.trim();
const company = (product.company_name || product.company)?.trim();
const line = (product.line_name || product.line)?.trim();
// Name is most important - repeat 3x to weight it heavily in the embedding
if (name) {
parts.push(name, name, name);
}
// Company and line provide context
if (company) {
parts.push(company);
}
if (line) {
parts.push(line);
}
// Truncate description to prevent it from overwhelming the signal
if (description) {
const truncated = description.length > 500
? description.substring(0, 500) + '...'
: description;
parts.push(truncated);
}
return parts.join(' ').trim();
}
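// Illustrative example (hypothetical company): for { name: 'Floral Washi Tape Set',
// company_name: 'Acme Crafts', description: 'Decorative paper tape with flower patterns...' }
// the embedding text becomes
//   'Floral Washi Tape Set Floral Washi Tape Set Floral Washi Tape Set Acme Crafts Decorative paper tape...'
// so the name dominates the vector while the description only adds supporting signal.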
/**
* Generate embedding for a product
* @param {Object} product - Product with name, description, company, line
* @returns {Promise<{embedding: number[], latencyMs: number}>}
*/
async function getProductEmbedding(product) {
if (!initialized || !openaiProvider) {
throw new Error('AI service not initialized');
}
const text = buildProductText(product);
if (!text) {
return { embedding: null, latencyMs: 0 };
}
const result = await openaiProvider.embed(text);
return {
embedding: result.embeddings[0],
latencyMs: result.latencyMs
};
}
/**
* Generate embeddings for multiple products
* @param {Object[]} products - Array of products
* @returns {Promise<{embeddings: Array<{index: number, embedding: number[]}>, latencyMs: number}>}
*/
async function getProductEmbeddings(products) {
if (!initialized || !openaiProvider) {
throw new Error('AI service not initialized');
}
const texts = products.map(buildProductText);
// Track which products have empty text
const validIndices = texts.map((t, i) => t ? i : -1).filter(i => i >= 0);
const validTexts = texts.filter(t => t);
if (validTexts.length === 0) {
return { embeddings: [], latencyMs: 0 };
}
const result = await openaiProvider.embed(validTexts);
// Map embeddings back to original indices
const embeddings = validIndices.map((originalIndex, resultIndex) => ({
index: originalIndex,
embedding: result.embeddings[resultIndex]
}));
return {
embeddings,
latencyMs: result.latencyMs
};
}
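// Illustrative example: if the built texts are ['washi tape', '', 'ink pad'], validIndices is
// [0, 2], only two texts are sent to OpenAI, and the result maps back as
//   [{ index: 0, embedding: [...] }, { index: 2, embedding: [...] }]
// so callers can line suggestions up with the original product array despite the gap at index 1.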
/**
* Find similar taxonomy items for a product embedding
* @param {number[]} productEmbedding
* @param {Object} options
* @returns {{categories: Array, themes: Array, colors: Array}}
*/
function findSimilarTaxonomy(productEmbedding, options = {}) {
if (!initialized || !taxonomyEmbeddings) {
throw new Error('AI service not initialized');
}
const topCategories = options.topCategories ?? 10;
const topThemes = options.topThemes ?? 5;
const topColors = options.topColors ?? 5;
return {
categories: taxonomyEmbeddings.findSimilarCategories(productEmbedding, topCategories),
themes: taxonomyEmbeddings.findSimilarThemes(productEmbedding, topThemes),
colors: taxonomyEmbeddings.findSimilarColors(productEmbedding, topColors)
};
}
/**
* Get product embedding and find similar taxonomy in one call
* @param {Object} product
* @param {Object} options
*/
async function getSuggestionsForProduct(product, options = {}) {
const { embedding, latencyMs: embeddingLatency } = await getProductEmbedding(product);
if (!embedding) {
return {
categories: [],
themes: [],
colors: [],
latencyMs: embeddingLatency
};
}
const startSearch = Date.now();
const suggestions = findSimilarTaxonomy(embedding, options);
const searchLatency = Date.now() - startSearch;
return {
...suggestions,
latencyMs: embeddingLatency + searchLatency,
embeddingLatencyMs: embeddingLatency,
searchLatencyMs: searchLatency
};
}
/**
* Get all taxonomy data (without embeddings) for frontend
*/
function getTaxonomyData() {
if (!initialized || !taxonomyEmbeddings) {
throw new Error('AI service not initialized');
}
return taxonomyEmbeddings.getTaxonomyData();
}
/**
* Get service status
*/
function getStatus() {
return {
initialized,
ready: isReady(),
hasProvider: !!openaiProvider,
hasTaxonomy: !!taxonomyEmbeddings,
taxonomyStats: taxonomyEmbeddings ? {
categories: taxonomyEmbeddings.categories?.length || 0,
themes: taxonomyEmbeddings.themes?.length || 0,
colors: taxonomyEmbeddings.colors?.length || 0
} : null
};
}
module.exports = {
initialize,
isReady,
getProductEmbedding,
getProductEmbeddings,
findSimilarTaxonomy,
getSuggestionsForProduct,
getTaxonomyData,
getStatus,
// Re-export utilities
cosineSimilarity,
findTopMatches
};
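A minimal sketch of the module's surface, assuming an already-open MySQL connection; the example product is illustrative.
const aiService = require('../services/ai'); // path as used by the routes above
async function example(mysqlConnection) {
const { success, stats } = await aiService.initialize({
openaiApiKey: process.env.OPENAI_API_KEY,
mysqlConnection,
logger: console
});
if (!success) throw new Error('AI service failed to initialize');
console.log('Taxonomy embedded:', stats); // { categories, themes, colors }
const suggestions = await aiService.getSuggestionsForProduct({
name: 'Distress Oxide Ink Pad - Mermaid Lagoon',
description: 'Water-reactive dye ink that creates an oxidized effect'
});
console.log(suggestions.categories.slice(0, 3));
console.log(`${suggestions.embeddingLatencyMs}ms embedding + ${suggestions.searchLatencyMs}ms search`);
}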


@@ -0,0 +1,117 @@
/**
* OpenAI Provider - Handles embedding generation
*/
const EMBEDDING_MODEL = 'text-embedding-3-small';
const EMBEDDING_DIMENSIONS = 1536;
const MAX_BATCH_SIZE = 2048;
class OpenAIProvider {
constructor({ apiKey, baseUrl = 'https://api.openai.com/v1', timeoutMs = 60000 }) {
if (!apiKey) {
throw new Error('OpenAI API key is required');
}
this.apiKey = apiKey;
this.baseUrl = baseUrl;
this.timeoutMs = timeoutMs;
}
/**
* Generate embeddings for one or more texts
* @param {string|string[]} input - Text or array of texts
* @param {Object} options
* @returns {Promise<{embeddings: number[][], usage: Object, model: string, latencyMs: number}>}
*/
async embed(input, options = {}) {
const texts = Array.isArray(input) ? input : [input];
const model = options.model || EMBEDDING_MODEL;
const dimensions = options.dimensions || EMBEDDING_DIMENSIONS;
const timeoutMs = options.timeoutMs || this.timeoutMs;
if (texts.length > MAX_BATCH_SIZE) {
throw new Error(`Batch size ${texts.length} exceeds max of ${MAX_BATCH_SIZE}`);
}
const started = Date.now();
// Clean and truncate input texts
const cleanedTexts = texts.map(t =>
(t || '').replace(/\n+/g, ' ').trim().substring(0, 8000)
);
const body = {
input: cleanedTexts,
model,
encoding_format: 'float'
};
// Only embedding-3 models support dimensions parameter
if (model.includes('embedding-3')) {
body.dimensions = dimensions;
}
const response = await this._makeRequest('embeddings', body, timeoutMs);
// Sort by index to ensure order matches input
const sortedData = response.data.sort((a, b) => a.index - b.index);
return {
embeddings: sortedData.map(item => item.embedding),
usage: {
promptTokens: response.usage?.prompt_tokens || 0,
totalTokens: response.usage?.total_tokens || 0
},
model: response.model || model,
latencyMs: Date.now() - started
};
}
/**
* Generator for processing large batches in chunks
*/
async *embedBatchChunked(texts, options = {}) {
const batchSize = Math.min(options.batchSize || 100, MAX_BATCH_SIZE);
for (let i = 0; i < texts.length; i += batchSize) {
const chunk = texts.slice(i, i + batchSize);
const result = await this.embed(chunk, options);
yield {
embeddings: result.embeddings,
startIndex: i,
endIndex: i + chunk.length,
usage: result.usage,
model: result.model,
latencyMs: result.latencyMs
};
}
}
async _makeRequest(endpoint, body, timeoutMs) {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(`${this.baseUrl}/${endpoint}`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`
},
body: JSON.stringify(body),
signal: controller.signal
});
if (!response.ok) {
const error = await response.json().catch(() => ({}));
throw new Error(error.error?.message || `OpenAI API error: ${response.status}`);
}
return response.json();
} finally {
clearTimeout(timeout);
}
}
}
module.exports = { OpenAIProvider, EMBEDDING_MODEL, EMBEDDING_DIMENSIONS };
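A brief usage sketch of the provider on its own; the example texts are arbitrary and the require path assumes a sibling module.
const { OpenAIProvider } = require('./openaiProvider');
async function example() {
const provider = new OpenAIProvider({ apiKey: process.env.OPENAI_API_KEY });
// Single call: accepts a string or an array of up to 2048 texts per request
const { embeddings, usage, latencyMs } = await provider.embed(['washi tape', 'ink pad']);
console.log(embeddings.length, embeddings[0].length, usage.totalTokens, latencyMs); // 2 vectors of 1536 dims
// Chunked generator: useful when the input may exceed one request
const texts = Array.from({ length: 250 }, (_, i) => `category ${i}`);
for await (const chunk of provider.embedBatchChunked(texts, { batchSize: 100 })) {
console.log(`embedded ${chunk.startIndex}-${chunk.endIndex - 1}`);
}
}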