Add AI embeddings and category suggestions, plus a few validation-step tweaks and fixes

2026-01-19 11:34:55 -05:00
parent 9ce84fe5b9
commit 43d76e011d
20 changed files with 5311 additions and 176 deletions


@@ -0,0 +1,283 @@
#!/usr/bin/env node
/**
* Embedding Proof-of-Concept Script
*
* Demonstrates how category embeddings work for product matching.
* Uses OpenAI text-embedding-3-small model.
*
* Usage: node scripts/embedding-poc.js
*/
const path = require('path');
require('dotenv').config({ path: path.join(__dirname, '../.env') });
const { getDbConnection, closeAllConnections } = require('../src/utils/dbConnection');
// ============================================================================
// Configuration
// ============================================================================
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const EMBEDDING_MODEL = 'text-embedding-3-small';
const EMBEDDING_DIMENSIONS = 1536;
// Sample products to test (you can modify these)
const TEST_PRODUCTS = [
{
name: "Cosmos Infinity Chipboard - Stamperia",
description: "Laser-cut chipboard shapes featuring celestial designs for mixed media projects"
},
{
name: "Distress Oxide Ink Pad - Mermaid Lagoon",
description: "Water-reactive dye ink that creates an oxidized effect"
},
{
name: "Hedwig Puffy Stickers - Paper House Productions",
description: "3D puffy stickers featuring Harry Potter's owl Hedwig"
},
{
name: "Black Velvet Watercolor Brush Size 6",
description: "Round brush for watercolor painting with synthetic bristles"
},
{
name: "Floral Washi Tape Set",
description: "Decorative paper tape with flower patterns, pack of 6 rolls"
}
];
// ============================================================================
// OpenAI Embedding Functions
// ============================================================================
async function getEmbeddings(texts) {
const response = await fetch('https://api.openai.com/v1/embeddings', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${OPENAI_API_KEY}`
},
body: JSON.stringify({
input: texts.map(t => t.substring(0, 8000)), // Max 8k chars per text
model: EMBEDDING_MODEL,
dimensions: EMBEDDING_DIMENSIONS
})
});
if (!response.ok) {
const error = await response.json();
throw new Error(`OpenAI API error: ${error.error?.message || response.status}`);
}
const data = await response.json();
// Sort by index to ensure order matches input
const sorted = data.data.sort((a, b) => a.index - b.index);
return {
embeddings: sorted.map(item => item.embedding),
usage: data.usage,
model: data.model
};
}
// ============================================================================
// Vector Math
// ============================================================================
function cosineSimilarity(a, b) {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
function findTopMatches(queryEmbedding, categoryEmbeddings, topK = 10) {
const scored = categoryEmbeddings.map(cat => ({
...cat,
similarity: cosineSimilarity(queryEmbedding, cat.embedding)
}));
scored.sort((a, b) => b.similarity - a.similarity);
return scored.slice(0, topK);
}
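// Worked example (illustrative values only): cosine similarity compares direction, not magnitude.
//   cosineSimilarity([1, 2], [2, 4])  === 1   (parallel vectors, same direction)
//   cosineSimilarity([1, 0], [0, 1])  === 0   (orthogonal vectors, unrelated)
//   cosineSimilarity([1, 0], [-1, 0]) === -1  (opposite direction)
// OpenAI embeddings are normalized to unit length, so the dot product alone would give the same ranking.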
// ============================================================================
// Database Functions
// ============================================================================
async function fetchCategories(connection) {
console.log('\n📂 Fetching categories from database...');
// Fetch hierarchical categories (types 10-13)
const [rows] = await connection.query(`
SELECT
cat_id,
name,
master_cat_id,
type
FROM product_categories
WHERE type IN (10, 11, 12, 13)
ORDER BY type, name
`);
console.log(` Found ${rows.length} category records`);
// Build category paths
const byId = new Map(rows.map(r => [r.cat_id, r]));
const categories = [];
for (const row of rows) {
const path = [];
let current = row;
// Walk up the tree to build full path
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
categories.push({
id: row.cat_id,
name: row.name,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ') // For embedding generation
});
}
// Count by level
const levels = {
10: categories.filter(c => c.type === 10).length,
11: categories.filter(c => c.type === 11).length,
12: categories.filter(c => c.type === 12).length,
13: categories.filter(c => c.type === 13).length,
};
console.log(` Level breakdown: ${levels[10]} top-level, ${levels[11]} L2, ${levels[12]} L3, ${levels[13]} L4`);
return categories;
}
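// Illustrative example (hypothetical rows): given
//   { cat_id: 1, name: 'Scrapbooking',    master_cat_id: null, type: 10 }
//   { cat_id: 2, name: 'Embellishments',  master_cat_id: 1,    type: 11 }
//   { cat_id: 3, name: 'Stickers',        master_cat_id: 2,    type: 12 }
// walking up the tree for cat_id 3 produces:
//   fullPath:      'Scrapbooking > Embellishments > Stickers'
//   embeddingText: 'Scrapbooking Embellishments Stickers'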
// ============================================================================
// Main Script
// ============================================================================
async function main() {
console.log('═══════════════════════════════════════════════════════════════');
console.log(' EMBEDDING PROOF-OF-CONCEPT');
console.log(' Model: ' + EMBEDDING_MODEL);
console.log('═══════════════════════════════════════════════════════════════');
if (!OPENAI_API_KEY) {
console.error('❌ OPENAI_API_KEY not found in environment');
process.exit(1);
}
let connection;
try {
// Step 1: Connect to database
console.log('\n🔌 Connecting to database via SSH tunnel...');
const { connection: conn } = await getDbConnection();
connection = conn;
console.log(' ✅ Connected');
// Step 2: Fetch categories
const categories = await fetchCategories(connection);
// Step 3: Generate embeddings for categories
console.log('\n🧮 Generating embeddings for categories...');
console.log(' Estimated cost: ~$' + (categories.length * 10 * 0.02 / 1e6).toFixed(6) + ' (assuming ~10 tokens per category path at $0.02 per 1M tokens)');
const startTime = Date.now();
// Process in batches of 100 (OpenAI limit is 2048)
const BATCH_SIZE = 100;
let totalTokens = 0;
for (let i = 0; i < categories.length; i += BATCH_SIZE) {
const batch = categories.slice(i, i + BATCH_SIZE);
const texts = batch.map(c => c.embeddingText);
const result = await getEmbeddings(texts);
// Attach embeddings to categories
for (let j = 0; j < batch.length; j++) {
batch[j].embedding = result.embeddings[j];
}
totalTokens += result.usage.total_tokens;
console.log(` Batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(categories.length / BATCH_SIZE)}: ${batch.length} categories embedded`);
}
const embeddingTime = Date.now() - startTime;
console.log(` ✅ Generated ${categories.length} embeddings in ${embeddingTime}ms`);
console.log(` 📊 Total tokens used: ${totalTokens} (~$${(totalTokens * 0.02 / 1e6).toFixed(6)} at $0.02 per 1M tokens)`);
// Step 4: Test with sample products
console.log('\n═══════════════════════════════════════════════════════════════');
console.log(' TESTING WITH SAMPLE PRODUCTS');
console.log('═══════════════════════════════════════════════════════════════');
for (const product of TEST_PRODUCTS) {
console.log('\n┌─────────────────────────────────────────────────────────────');
console.log(`│ Product: "${product.name}"`);
console.log(`│ Description: "${product.description.substring(0, 60)}..."`);
console.log('├─────────────────────────────────────────────────────────────');
// Generate embedding for product
const productText = `${product.name} ${product.description}`;
const { embeddings: [productEmbedding] } = await getEmbeddings([productText]);
// Find top matches
const matches = findTopMatches(productEmbedding, categories, 10);
console.log('│ Top 10 Category Matches:');
matches.forEach((match, i) => {
const similarity = (match.similarity * 100).toFixed(1);
const bar = '█'.repeat(Math.round(match.similarity * 20));
const marker = i < 3 ? ' ✅' : '';
console.log(`│ ${(i + 1).toString().padStart(2)}. [${similarity.padStart(5)}%] ${bar.padEnd(20)} ${match.fullPath}${marker}`);
});
console.log('└─────────────────────────────────────────────────────────────');
}
// Step 5: Summary
console.log('\n═══════════════════════════════════════════════════════════════');
console.log(' SUMMARY');
console.log('═══════════════════════════════════════════════════════════════');
console.log(` Categories embedded: ${categories.length}`);
console.log(` Embedding time: ${embeddingTime}ms (one-time cost)`);
console.log(` Per-product lookup: ~${Math.round((Date.now() - startTime - embeddingTime) / TEST_PRODUCTS.length)}ms (excludes the one-time category embedding)`);
console.log(` Vector dimensions: ${EMBEDDING_DIMENSIONS}`);
console.log(` Memory usage: ~${(categories.length * EMBEDDING_DIMENSIONS * 8 / 1024 / 1024).toFixed(2)} MB (plain JS arrays store 64-bit floats)`);
console.log('');
console.log(' 💡 In production:');
console.log(' - Category embeddings are computed once and cached');
console.log(' - Only product embedding is computed per-request (~$0.00002)');
console.log(' - Vector search is instant (in-memory cosine similarity)');
console.log(' - Top 10 results go to AI for final selection (~$0.0001)');
console.log('═══════════════════════════════════════════════════════════════\n');
} catch (error) {
console.error('\n❌ Error:', error.message);
if (error.stack) {
console.error(error.stack);
}
process.exit(1);
} finally {
await closeAllConnections();
console.log('🔌 Database connections closed');
}
}
// Run the script
main();


@@ -0,0 +1,281 @@
/**
* AI Routes
*
* API endpoints for AI-powered product validation features.
* Provides embedding generation and similarity-based suggestions.
*/
const express = require('express');
const router = express.Router();
const aiService = require('../services/ai');
const { getDbConnection, closeAllConnections } = require('../utils/dbConnection');
// Track initialization state
let initializationPromise = null;
/**
* Ensure AI service is initialized
* Uses lazy initialization on first request
*/
async function ensureInitialized() {
if (aiService.isReady()) {
return true;
}
if (initializationPromise) {
await initializationPromise;
return aiService.isReady();
}
initializationPromise = (async () => {
try {
console.log('[AI Routes] Initializing AI service...');
// Get database connection for taxonomy
const { connection } = await getDbConnection();
const result = await aiService.initialize({
openaiApiKey: process.env.OPENAI_API_KEY,
mysqlConnection: connection,
logger: console
});
if (!result.success) {
console.error('[AI Routes] AI service initialization failed:', result.message);
return false;
}
console.log('[AI Routes] AI service initialized:', result.stats);
return true;
} catch (error) {
console.error('[AI Routes] Failed to initialize AI service:', error);
return false;
}
})();
await initializationPromise;
if (!aiService.isReady()) {
// Clear the cached promise so a later request (or POST /api/ai/initialize) can retry after a failed attempt
initializationPromise = null;
}
return aiService.isReady();
}
/**
* GET /api/ai/status
* Get AI service status
*/
router.get('/status', async (req, res) => {
try {
const status = aiService.getStatus();
res.json(status);
} catch (error) {
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/initialize
* Manually trigger initialization (also happens automatically on first use)
*/
router.post('/initialize', async (req, res) => {
try {
const ready = await ensureInitialized();
const status = aiService.getStatus();
res.json({
success: ready,
...status
});
} catch (error) {
console.error('[AI Routes] Initialize error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/ai/taxonomy
* Get all taxonomy data (categories, themes, colors) without embeddings
*/
router.get('/taxonomy', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const taxonomy = aiService.getTaxonomyData();
res.json(taxonomy);
} catch (error) {
console.error('[AI Routes] Taxonomy error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/embedding
* Generate embedding for a single product
*
* Body: { product: { name, description, company_name, line_name } }
* Returns: { embedding: number[], latencyMs: number }
*/
router.post('/embedding', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { product } = req.body;
if (!product) {
return res.status(400).json({ error: 'Product is required' });
}
const result = await aiService.getProductEmbedding(product);
res.json(result);
} catch (error) {
console.error('[AI Routes] Embedding error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/embeddings
* Generate embeddings for multiple products
*
* Body: { products: Array<{ name, description, company_name, line_name }> }
* Returns: { embeddings: Array<{ index, embedding }>, latencyMs }
*/
router.post('/embeddings', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { products } = req.body;
if (!Array.isArray(products)) {
return res.status(400).json({ error: 'Products array is required' });
}
const result = await aiService.getProductEmbeddings(products);
res.json(result);
} catch (error) {
console.error('[AI Routes] Embeddings error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/suggestions
* Get category/theme/color suggestions for a single product
* Generates embedding and finds similar taxonomy items
*
* Body: { product: { name, description, company_name, line_name }, options?: { topCategories, topThemes, topColors } }
* Returns: { categories: Array, themes: Array, colors: Array, latencyMs }
*/
router.post('/suggestions', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { product, options } = req.body;
if (!product) {
return res.status(400).json({ error: 'Product is required' });
}
const suggestions = await aiService.getSuggestionsForProduct(product, options);
res.json(suggestions);
} catch (error) {
console.error('[AI Routes] Suggestions error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/suggestions/batch
* Get suggestions for multiple products
* More efficient than calling /suggestions multiple times
*
* Body: { products: Array, options?: { topCategories, topThemes, topColors } }
* Returns: { results: Array<{ index, categories, themes, colors }>, latencyMs }
*/
router.post('/suggestions/batch', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { products, options } = req.body;
if (!Array.isArray(products)) {
return res.status(400).json({ error: 'Products array is required' });
}
const startTime = Date.now();
// Generate all embeddings at once
const { embeddings, latencyMs: embeddingLatency } = await aiService.getProductEmbeddings(products);
// Find suggestions for each embedding
const results = embeddings.map(({ index, embedding }) => {
const suggestions = aiService.findSimilarTaxonomy(embedding, options);
return {
index,
...suggestions
};
});
const totalLatency = Date.now() - startTime;
res.json({
results,
latencyMs: totalLatency,
embeddingLatencyMs: embeddingLatency,
searchLatencyMs: totalLatency - embeddingLatency,
productCount: products.length,
embeddingCount: embeddings.length
});
} catch (error) {
console.error('[AI Routes] Batch suggestions error:', error);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/ai/similar
* Find similar taxonomy items given a pre-computed embedding
* Useful when frontend has cached the embedding
*
* Body: { embedding: number[], options?: { topCategories, topThemes, topColors } }
* Returns: { categories, themes, colors }
*/
router.post('/similar', async (req, res) => {
try {
const ready = await ensureInitialized();
if (!ready) {
return res.status(503).json({ error: 'AI service not available' });
}
const { embedding, options } = req.body;
if (!embedding || !Array.isArray(embedding)) {
return res.status(400).json({ error: 'Embedding array is required' });
}
const startTime = Date.now();
const suggestions = aiService.findSimilarTaxonomy(embedding, options);
res.json({
...suggestions,
latencyMs: Date.now() - startTime
});
} catch (error) {
console.error('[AI Routes] Similar error:', error);
res.status(500).json({ error: error.message });
}
});
module.exports = router;
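A minimal client-side sketch of calling the suggestions endpoint; the localhost base URL and the example product are assumptions for illustration, not part of this commit.
// Hypothetical caller: POST a product to /api/ai/suggestions and log the top category match.
async function fetchSuggestions() {
const response = await fetch('http://localhost:3000/api/ai/suggestions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
product: {
name: 'Floral Washi Tape Set',
description: 'Decorative paper tape with flower patterns, pack of 6 rolls'
},
options: { topCategories: 10, topThemes: 5, topColors: 5 }
})
});
if (!response.ok) throw new Error(`Request failed: ${response.status}`);
const { categories, themes, colors, latencyMs } = await response.json();
console.log(`Got ${categories.length} category suggestions in ${latencyMs}ms`, categories[0]);
}
fetchSuggestions().catch(console.error);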


@@ -15,6 +15,7 @@ const configRouter = require('./routes/config');
const metricsRouter = require('./routes/metrics');
const importRouter = require('./routes/import');
const aiValidationRouter = require('./routes/ai-validation');
const aiRouter = require('./routes/ai');
const templatesRouter = require('./routes/templates');
const aiPromptsRouter = require('./routes/ai-prompts');
const reusableImagesRouter = require('./routes/reusable-images');
@@ -124,6 +125,7 @@ async function startServer() {
app.use('/api/brands-aggregate', brandsAggregateRouter);
app.use('/api/import', importRouter);
app.use('/api/ai-validation', aiValidationRouter);
app.use('/api/ai', aiRouter);
app.use('/api/templates', templatesRouter);
app.use('/api/ai-prompts', aiPromptsRouter);
app.use('/api/reusable-images', reusableImagesRouter);


@@ -0,0 +1,82 @@
/**
* Vector similarity utilities
*/
/**
* Compute cosine similarity between two vectors
* @param {number[]} a
* @param {number[]} b
* @returns {number} Similarity score between -1 and 1
*/
function cosineSimilarity(a, b) {
if (!a || !b || a.length !== b.length) {
return 0;
}
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
if (denominator === 0) return 0;
return dotProduct / denominator;
}
/**
* Find top K most similar items from a collection
* @param {number[]} queryEmbedding - The embedding to search for
* @param {Array<{id: any, embedding: number[]}>} items - Items with embeddings
* @param {number} topK - Number of results to return
* @returns {Array<{id: any, similarity: number}>}
*/
function findTopMatches(queryEmbedding, items, topK = 10) {
if (!queryEmbedding || !items || items.length === 0) {
return [];
}
const scored = items.map(item => ({
id: item.id,
similarity: cosineSimilarity(queryEmbedding, item.embedding)
}));
scored.sort((a, b) => b.similarity - a.similarity);
return scored.slice(0, topK);
}
/**
* Find matches above a similarity threshold
* @param {number[]} queryEmbedding
* @param {Array<{id: any, embedding: number[]}>} items
* @param {number} threshold - Minimum similarity (0-1)
* @returns {Array<{id: any, similarity: number}>}
*/
function findMatchesAboveThreshold(queryEmbedding, items, threshold = 0.5) {
if (!queryEmbedding || !items || items.length === 0) {
return [];
}
const scored = items
.map(item => ({
id: item.id,
similarity: cosineSimilarity(queryEmbedding, item.embedding)
}))
.filter(item => item.similarity >= threshold);
scored.sort((a, b) => b.similarity - a.similarity);
return scored;
}
module.exports = {
cosineSimilarity,
findTopMatches,
findMatchesAboveThreshold
};
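A small self-contained sketch of how these helpers compose, using toy 3-dimensional vectors rather than real 1536-dimensional embeddings (require path assumes a sibling module).
const { cosineSimilarity, findTopMatches, findMatchesAboveThreshold } = require('./similarity');
const query = [0.9, 0.1, 0.0];
const items = [
{ id: 'stickers', embedding: [1.0, 0.0, 0.0] },
{ id: 'ink-pads', embedding: [0.0, 1.0, 0.0] },
{ id: 'brushes', embedding: [0.7, 0.3, 0.0] }
];
console.log(cosineSimilarity(query, items[0].embedding).toFixed(3)); // 0.994, nearly parallel
console.log(findTopMatches(query, items, 2)); // [{ id: 'stickers', ... }, { id: 'brushes', ... }]
console.log(findMatchesAboveThreshold(query, items, 0.5)); // drops 'ink-pads' (similarity ~0.11)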


@@ -0,0 +1,323 @@
/**
* Taxonomy Embedding Service
*
* Generates and caches embeddings for categories, themes, and colors.
* Excludes "Black Friday", "Gifts", "Deals" categories and their children.
*/
const { findTopMatches } = require('./similarity');
// Categories to exclude (and all their children)
const EXCLUDED_CATEGORY_NAMES = ['black friday', 'gifts', 'deals'];
class TaxonomyEmbeddings {
constructor({ provider, logger }) {
this.provider = provider;
this.logger = logger || console;
// Cached taxonomy with embeddings
this.categories = [];
this.themes = [];
this.colors = [];
// Raw data without embeddings (for lookup)
this.categoryMap = new Map();
this.themeMap = new Map();
this.colorMap = new Map();
this.initialized = false;
this.initializing = false;
}
/**
* Initialize embeddings - fetch taxonomy and generate embeddings
*/
async initialize(connection) {
if (this.initialized) {
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
}
if (this.initializing) {
// Wait for existing initialization
while (this.initializing) {
await new Promise(resolve => setTimeout(resolve, 100));
}
return { categories: this.categories.length, themes: this.themes.length, colors: this.colors.length };
}
this.initializing = true;
try {
this.logger.info('[TaxonomyEmbeddings] Starting initialization...');
// Fetch raw taxonomy data
const [categories, themes, colors] = await Promise.all([
this._fetchCategories(connection),
this._fetchThemes(connection),
this._fetchColors(connection)
]);
this.logger.info(`[TaxonomyEmbeddings] Fetched ${categories.length} categories, ${themes.length} themes, ${colors.length} colors`);
// Generate embeddings in parallel
const [catEmbeddings, themeEmbeddings, colorEmbeddings] = await Promise.all([
this._generateEmbeddings(categories, 'categories'),
this._generateEmbeddings(themes, 'themes'),
this._generateEmbeddings(colors, 'colors')
]);
// Store with embeddings
this.categories = catEmbeddings;
this.themes = themeEmbeddings;
this.colors = colorEmbeddings;
// Build lookup maps
this.categoryMap = new Map(this.categories.map(c => [c.id, c]));
this.themeMap = new Map(this.themes.map(t => [t.id, t]));
this.colorMap = new Map(this.colors.map(c => [c.id, c]));
this.initialized = true;
this.logger.info('[TaxonomyEmbeddings] Initialization complete');
return {
categories: this.categories.length,
themes: this.themes.length,
colors: this.colors.length
};
} catch (error) {
this.logger.error('[TaxonomyEmbeddings] Initialization failed:', error);
throw error;
} finally {
this.initializing = false;
}
}
/**
* Find similar categories for a product embedding
*/
findSimilarCategories(productEmbedding, topK = 10) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.categories, topK);
return matches.map(match => {
const cat = this.categoryMap.get(match.id);
return {
id: match.id,
name: cat?.name || '',
fullPath: cat?.fullPath || '',
similarity: match.similarity
};
});
}
/**
* Find similar themes for a product embedding
*/
findSimilarThemes(productEmbedding, topK = 5) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.themes, topK);
return matches.map(match => {
const theme = this.themeMap.get(match.id);
return {
id: match.id,
name: theme?.name || '',
fullPath: theme?.fullPath || '',
similarity: match.similarity
};
});
}
/**
* Find similar colors for a product embedding
*/
findSimilarColors(productEmbedding, topK = 5) {
if (!this.initialized || !productEmbedding) {
return [];
}
const matches = findTopMatches(productEmbedding, this.colors, topK);
return matches.map(match => {
const color = this.colorMap.get(match.id);
return {
id: match.id,
name: color?.name || '',
similarity: match.similarity
};
});
}
/**
* Get all taxonomy data (without embeddings) for frontend
*/
getTaxonomyData() {
return {
categories: this.categories.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
themes: this.themes.map(({ id, name, fullPath, parentId }) => ({ id, name, fullPath, parentId })),
colors: this.colors.map(({ id, name }) => ({ id, name }))
};
}
/**
* Check if service is ready
*/
isReady() {
return this.initialized;
}
// ============================================================================
// Private Methods
// ============================================================================
async _fetchCategories(connection) {
// Fetch hierarchical categories (types 10-13)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (10, 11, 12, 13)
ORDER BY type, name
`);
// Build lookup for hierarchy
const byId = new Map(rows.map(r => [r.cat_id, r]));
// Find IDs of excluded top-level categories and all their descendants
const excludedIds = new Set();
// First pass: find excluded top-level categories
for (const row of rows) {
if (row.type === 10 && EXCLUDED_CATEGORY_NAMES.includes(row.name.toLowerCase())) {
excludedIds.add(row.cat_id);
}
}
// Multiple passes to find all descendants
let foundNew = true;
while (foundNew) {
foundNew = false;
for (const row of rows) {
if (!excludedIds.has(row.cat_id) && excludedIds.has(row.master_cat_id)) {
excludedIds.add(row.cat_id);
foundNew = true;
}
}
}
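// Illustrative example (hypothetical ids): if 'Gifts' is cat_id 50 (type 10), pass 1 adds its
// direct children (rows with master_cat_id 50), pass 2 adds their children, and so on until a
// full pass adds nothing new, so the entire excluded subtree is skipped below.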
this.logger.info(`[TaxonomyEmbeddings] Excluding ${excludedIds.size} categories (Black Friday, Gifts, Deals and children)`);
// Build category objects with full paths, excluding filtered ones
const categories = [];
for (const row of rows) {
if (excludedIds.has(row.cat_id)) {
continue;
}
const path = [];
let current = row;
// Walk up the tree to build full path
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
categories.push({
id: row.cat_id,
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
});
}
return categories;
}
async _fetchThemes(connection) {
// Fetch themes (types 20-21)
const [rows] = await connection.query(`
SELECT cat_id, name, master_cat_id, type
FROM product_categories
WHERE type IN (20, 21)
ORDER BY type, name
`);
const byId = new Map(rows.map(r => [r.cat_id, r]));
const themes = [];
for (const row of rows) {
const path = [];
let current = row;
while (current) {
path.unshift(current.name);
current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
}
themes.push({
id: row.cat_id,
name: row.name,
parentId: row.master_cat_id,
type: row.type,
fullPath: path.join(' > '),
embeddingText: path.join(' ')
});
}
return themes;
}
async _fetchColors(connection) {
const [rows] = await connection.query(`
SELECT color, name, hex_color
FROM product_color_list
ORDER BY \`order\`
`);
return rows.map(row => ({
id: row.color,
name: row.name,
hexColor: row.hex_color,
embeddingText: row.name
}));
}
async _generateEmbeddings(items, label) {
if (items.length === 0) {
return items;
}
const startTime = Date.now();
const texts = items.map(item => item.embeddingText);
const results = [...items];
// Process in batches
let batchNum = 0;
for await (const chunk of this.provider.embedBatchChunked(texts, { batchSize: 100 })) {
batchNum++;
for (let i = 0; i < chunk.embeddings.length; i++) {
const globalIndex = chunk.startIndex + i;
results[globalIndex] = {
...results[globalIndex],
embedding: chunk.embeddings[i]
};
}
}
const elapsed = Date.now() - startTime;
this.logger.info(`[TaxonomyEmbeddings] Generated ${items.length} ${label} embeddings in ${elapsed}ms`);
return results;
}
}
module.exports = { TaxonomyEmbeddings };
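A rough sketch of how this class is wired up, mirroring what the AI service entry point does; the require paths assume the providers/ and embeddings/ layout implied by the imports in this commit, and the MySQL connection is assumed to be opened elsewhere.
const { OpenAIProvider } = require('../providers/openaiProvider');
const { TaxonomyEmbeddings } = require('./taxonomyEmbeddings');
async function example(mysqlConnection) {
const provider = new OpenAIProvider({ apiKey: process.env.OPENAI_API_KEY });
const taxonomy = new TaxonomyEmbeddings({ provider, logger: console });
// One-time cost: fetch taxonomy rows and embed them in batches of 100
const stats = await taxonomy.initialize(mysqlConnection);
console.log('Embedded taxonomy:', stats); // { categories, themes, colors }
// Per-product cost: a single embedding call, then in-memory similarity search
const { embeddings: [productEmbedding] } = await provider.embed('Floral Washi Tape Set');
console.log(taxonomy.findSimilarCategories(productEmbedding, 5));
}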


@@ -0,0 +1,273 @@
/**
* AI Service
*
* Main entry point for AI functionality including embeddings.
* Provides embedding generation and similarity search for product validation.
*/
const { OpenAIProvider } = require('./providers/openaiProvider');
const { TaxonomyEmbeddings } = require('./embeddings/taxonomyEmbeddings');
const { cosineSimilarity, findTopMatches } = require('./embeddings/similarity');
let initialized = false;
let initializing = false;
let openaiProvider = null;
let taxonomyEmbeddings = null;
let logger = console;
/**
* Initialize the AI service
* @param {Object} options
* @param {string} options.openaiApiKey - OpenAI API key
* @param {Object} options.mysqlConnection - MySQL connection for taxonomy data
* @param {Object} [options.logger] - Logger instance
*/
async function initialize({ openaiApiKey, mysqlConnection, logger: customLogger }) {
if (initialized) {
return { success: true, message: 'Already initialized' };
}
if (initializing) {
// Wait for existing initialization
while (initializing) {
await new Promise(resolve => setTimeout(resolve, 100));
}
return { success: initialized, message: initialized ? 'Initialized' : 'Initialization failed' };
}
initializing = true;
try {
if (customLogger) {
logger = customLogger;
}
if (!openaiApiKey) {
throw new Error('OpenAI API key is required');
}
logger.info('[AI] Initializing AI service...');
// Create OpenAI provider
openaiProvider = new OpenAIProvider({ apiKey: openaiApiKey });
// Create and initialize taxonomy embeddings
taxonomyEmbeddings = new TaxonomyEmbeddings({
provider: openaiProvider,
logger
});
const stats = await taxonomyEmbeddings.initialize(mysqlConnection);
initialized = true;
logger.info('[AI] AI service initialized', stats);
return {
success: true,
message: 'Initialized',
stats
};
} catch (error) {
logger.error('[AI] Initialization failed:', error);
return { success: false, message: error.message };
} finally {
initializing = false;
}
}
/**
* Check if service is ready
*/
function isReady() {
return initialized && taxonomyEmbeddings?.isReady();
}
/**
* Build weighted product text for embedding.
* Weights the product name heavily by repeating it, and truncates long descriptions
* to prevent verbose marketing copy from drowning out the product signal.
*
* @param {Object} product - Product with name, description, company, line
* @returns {string} - Combined text for embedding
*/
function buildProductText(product) {
const parts = [];
const name = product.name?.trim();
const description = product.description?.trim();
const company = (product.company_name || product.company)?.trim();
const line = (product.line_name || product.line)?.trim();
// Name is most important - repeat 3x to weight it heavily in the embedding
if (name) {
parts.push(name, name, name);
}
// Company and line provide context
if (company) {
parts.push(company);
}
if (line) {
parts.push(line);
}
// Truncate description to prevent it from overwhelming the signal
if (description) {
const truncated = description.length > 500
? description.substring(0, 500) + '...'
: description;
parts.push(truncated);
}
return parts.join(' ').trim();
}
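// Illustrative example (hypothetical company): for { name: 'Floral Washi Tape Set',
// company_name: 'Acme Crafts', description: 'Decorative paper tape with flower patterns...' }
// the embedding text becomes
//   'Floral Washi Tape Set Floral Washi Tape Set Floral Washi Tape Set Acme Crafts Decorative paper tape...'
// so the name dominates the vector while the description only adds supporting signal.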
/**
* Generate embedding for a product
* @param {Object} product - Product with name, description, company, line
* @returns {Promise<{embedding: number[], latencyMs: number}>}
*/
async function getProductEmbedding(product) {
if (!initialized || !openaiProvider) {
throw new Error('AI service not initialized');
}
const text = buildProductText(product);
if (!text) {
return { embedding: null, latencyMs: 0 };
}
const result = await openaiProvider.embed(text);
return {
embedding: result.embeddings[0],
latencyMs: result.latencyMs
};
}
/**
* Generate embeddings for multiple products
* @param {Object[]} products - Array of products
* @returns {Promise<{embeddings: Array<{index: number, embedding: number[]}>, latencyMs: number}>}
*/
async function getProductEmbeddings(products) {
if (!initialized || !openaiProvider) {
throw new Error('AI service not initialized');
}
const texts = products.map(buildProductText);
// Track which products have empty text
const validIndices = texts.map((t, i) => t ? i : -1).filter(i => i >= 0);
const validTexts = texts.filter(t => t);
if (validTexts.length === 0) {
return { embeddings: [], latencyMs: 0 };
}
const result = await openaiProvider.embed(validTexts);
// Map embeddings back to original indices
const embeddings = validIndices.map((originalIndex, resultIndex) => ({
index: originalIndex,
embedding: result.embeddings[resultIndex]
}));
return {
embeddings,
latencyMs: result.latencyMs
};
}
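// Illustrative example: if the built texts are ['washi tape', '', 'ink pad'], validIndices is
// [0, 2], only two texts are sent to OpenAI, and the result maps back as
//   [{ index: 0, embedding: [...] }, { index: 2, embedding: [...] }]
// so callers can line suggestions up with the original product array despite the gap at index 1.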
/**
* Find similar taxonomy items for a product embedding
* @param {number[]} productEmbedding
* @param {Object} options
* @returns {{categories: Array, themes: Array, colors: Array}}
*/
function findSimilarTaxonomy(productEmbedding, options = {}) {
if (!initialized || !taxonomyEmbeddings) {
throw new Error('AI service not initialized');
}
const topCategories = options.topCategories ?? 10;
const topThemes = options.topThemes ?? 5;
const topColors = options.topColors ?? 5;
return {
categories: taxonomyEmbeddings.findSimilarCategories(productEmbedding, topCategories),
themes: taxonomyEmbeddings.findSimilarThemes(productEmbedding, topThemes),
colors: taxonomyEmbeddings.findSimilarColors(productEmbedding, topColors)
};
}
/**
* Get product embedding and find similar taxonomy in one call
* @param {Object} product
* @param {Object} options
*/
async function getSuggestionsForProduct(product, options = {}) {
const { embedding, latencyMs: embeddingLatency } = await getProductEmbedding(product);
if (!embedding) {
return {
categories: [],
themes: [],
colors: [],
latencyMs: embeddingLatency
};
}
const startSearch = Date.now();
const suggestions = findSimilarTaxonomy(embedding, options);
const searchLatency = Date.now() - startSearch;
return {
...suggestions,
latencyMs: embeddingLatency + searchLatency,
embeddingLatencyMs: embeddingLatency,
searchLatencyMs: searchLatency
};
}
/**
* Get all taxonomy data (without embeddings) for frontend
*/
function getTaxonomyData() {
if (!initialized || !taxonomyEmbeddings) {
throw new Error('AI service not initialized');
}
return taxonomyEmbeddings.getTaxonomyData();
}
/**
* Get service status
*/
function getStatus() {
return {
initialized,
ready: isReady(),
hasProvider: !!openaiProvider,
hasTaxonomy: !!taxonomyEmbeddings,
taxonomyStats: taxonomyEmbeddings ? {
categories: taxonomyEmbeddings.categories?.length || 0,
themes: taxonomyEmbeddings.themes?.length || 0,
colors: taxonomyEmbeddings.colors?.length || 0
} : null
};
}
module.exports = {
initialize,
isReady,
getProductEmbedding,
getProductEmbeddings,
findSimilarTaxonomy,
getSuggestionsForProduct,
getTaxonomyData,
getStatus,
// Re-export utilities
cosineSimilarity,
findTopMatches
};
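A minimal sketch of the module's surface, assuming an already-open MySQL connection; the example product is illustrative.
const aiService = require('../services/ai'); // path as used by the routes above
async function example(mysqlConnection) {
const { success, stats } = await aiService.initialize({
openaiApiKey: process.env.OPENAI_API_KEY,
mysqlConnection,
logger: console
});
if (!success) throw new Error('AI service failed to initialize');
console.log('Taxonomy embedded:', stats); // { categories, themes, colors }
const suggestions = await aiService.getSuggestionsForProduct({
name: 'Distress Oxide Ink Pad - Mermaid Lagoon',
description: 'Water-reactive dye ink that creates an oxidized effect'
});
console.log(suggestions.categories.slice(0, 3));
console.log(`${suggestions.embeddingLatencyMs}ms embedding + ${suggestions.searchLatencyMs}ms search`);
}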


@@ -0,0 +1,117 @@
/**
* OpenAI Provider - Handles embedding generation
*/
const EMBEDDING_MODEL = 'text-embedding-3-small';
const EMBEDDING_DIMENSIONS = 1536;
const MAX_BATCH_SIZE = 2048;
class OpenAIProvider {
constructor({ apiKey, baseUrl = 'https://api.openai.com/v1', timeoutMs = 60000 }) {
if (!apiKey) {
throw new Error('OpenAI API key is required');
}
this.apiKey = apiKey;
this.baseUrl = baseUrl;
this.timeoutMs = timeoutMs;
}
/**
* Generate embeddings for one or more texts
* @param {string|string[]} input - Text or array of texts
* @param {Object} options
* @returns {Promise<{embeddings: number[][], usage: Object, model: string, latencyMs: number}>}
*/
async embed(input, options = {}) {
const texts = Array.isArray(input) ? input : [input];
const model = options.model || EMBEDDING_MODEL;
const dimensions = options.dimensions || EMBEDDING_DIMENSIONS;
const timeoutMs = options.timeoutMs || this.timeoutMs;
if (texts.length > MAX_BATCH_SIZE) {
throw new Error(`Batch size ${texts.length} exceeds max of ${MAX_BATCH_SIZE}`);
}
const started = Date.now();
// Clean and truncate input texts
const cleanedTexts = texts.map(t =>
(t || '').replace(/\n+/g, ' ').trim().substring(0, 8000)
);
const body = {
input: cleanedTexts,
model,
encoding_format: 'float'
};
// Only embedding-3 models support dimensions parameter
if (model.includes('embedding-3')) {
body.dimensions = dimensions;
}
const response = await this._makeRequest('embeddings', body, timeoutMs);
// Sort by index to ensure order matches input
const sortedData = response.data.sort((a, b) => a.index - b.index);
return {
embeddings: sortedData.map(item => item.embedding),
usage: {
promptTokens: response.usage?.prompt_tokens || 0,
totalTokens: response.usage?.total_tokens || 0
},
model: response.model || model,
latencyMs: Date.now() - started
};
}
/**
* Generator for processing large batches in chunks
*/
async *embedBatchChunked(texts, options = {}) {
const batchSize = Math.min(options.batchSize || 100, MAX_BATCH_SIZE);
for (let i = 0; i < texts.length; i += batchSize) {
const chunk = texts.slice(i, i + batchSize);
const result = await this.embed(chunk, options);
yield {
embeddings: result.embeddings,
startIndex: i,
endIndex: i + chunk.length,
usage: result.usage,
model: result.model,
latencyMs: result.latencyMs
};
}
}
async _makeRequest(endpoint, body, timeoutMs) {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(`${this.baseUrl}/${endpoint}`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`
},
body: JSON.stringify(body),
signal: controller.signal
});
if (!response.ok) {
const error = await response.json().catch(() => ({}));
throw new Error(error.error?.message || `OpenAI API error: ${response.status}`);
}
return response.json();
} finally {
clearTimeout(timeout);
}
}
}
module.exports = { OpenAIProvider, EMBEDDING_MODEL, EMBEDDING_DIMENSIONS };
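A brief usage sketch of the provider on its own; the example texts are arbitrary and the require path assumes a sibling module.
const { OpenAIProvider } = require('./openaiProvider');
async function example() {
const provider = new OpenAIProvider({ apiKey: process.env.OPENAI_API_KEY });
// Single call: accepts a string or an array of up to 2048 texts per request
const { embeddings, usage, latencyMs } = await provider.embed(['washi tape', 'ink pad']);
console.log(embeddings.length, embeddings[0].length, usage.totalTokens, latencyMs); // 2 vectors of 1536 dims
// Chunked generator: useful when the input may exceed one request
const texts = Array.from({ length: 250 }, (_, i) => `category ${i}`);
for await (const chunk of provider.embedBatchChunked(texts, { batchSize: 100 })) {
console.log(`embedded ${chunk.startIndex}-${chunk.endIndex - 1}`);
}
}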