#!/usr/bin/env node
/**
 * Embedding Proof-of-Concept Script
 *
 * Demonstrates how category embeddings work for product matching.
 * Uses OpenAI text-embedding-3-small model.
 *
 * Usage: node scripts/embedding-poc.js
 */

const path = require('path');
require('dotenv').config({ path: path.join(__dirname, '../.env') });

const { getDbConnection, closeAllConnections } = require('../src/utils/dbConnection');

// ============================================================================
// Configuration
// ============================================================================

const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_EMBEDDINGS_URL = 'https://api.openai.com/v1/embeddings';
const EMBEDDING_MODEL = 'text-embedding-3-small';
const EMBEDDING_DIMENSIONS = 1536;

// Texts are cut to this many characters before being sent to the API.
const MAX_TEXT_CHARS = 8000;

// NOTE(review): 0.00002 USD is taken to be the price per 1K tokens for
// text-embedding-3-small — confirm against current OpenAI pricing.
const COST_PER_1K_TOKENS = 0.00002;

// Console decoration shared by the banner and the per-product result boxes.
const HEAVY_RULE = '═══════════════════════════════════════════════════════════════';
const BOX_RULE = '─────────────────────────────────────────────────────────────';

// Sample products to test (you can modify these)
const TEST_PRODUCTS = [
  {
    name: "Cosmos Infinity Chipboard - Stamperia",
    description: "Laser-cut chipboard shapes featuring celestial designs for mixed media projects"
  },
  {
    name: "Distress Oxide Ink Pad - Mermaid Lagoon",
    description: "Water-reactive dye ink that creates an oxidized effect"
  },
  {
    name: "Hedwig Puffy Stickers - Paper House Productions",
    description: "3D puffy stickers featuring Harry Potter's owl Hedwig"
  },
  {
    name: "Black Velvet Watercolor Brush Size 6",
    description: "Round brush for watercolor painting with synthetic bristles"
  },
  {
    name: "Floral Washi Tape Set",
    description: "Decorative paper tape with flower patterns, pack of 6 rolls"
  }
];

// ============================================================================
// OpenAI Embedding Functions
// ============================================================================

/**
 * Requests embeddings for a batch of texts from the OpenAI embeddings endpoint.
 *
 * @param {string[]} texts - Texts to embed; each is truncated to MAX_TEXT_CHARS characters.
 * @returns {Promise<{embeddings: number[][], usage: object, model: string}>}
 *   Embeddings ordered to match `texts`, plus the `usage` and `model` fields
 *   of the API response.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function getEmbeddings(texts) {
  const response = await fetch(OPENAI_EMBEDDINGS_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${OPENAI_API_KEY}`
    },
    body: JSON.stringify({
      input: texts.map(t => t.substring(0, MAX_TEXT_CHARS)),
      model: EMBEDDING_MODEL,
      dimensions: EMBEDDING_DIMENSIONS
    })
  });

  if (!response.ok) {
    // The error body is not guaranteed to be JSON; fall back to the HTTP
    // status instead of failing with an unrelated parse error.
    const errorBody = await response.json().catch(() => null);
    throw new Error(`OpenAI API error: ${errorBody?.error?.message || response.status}`);
  }

  const data = await response.json();

  // Sort a copy by index so the result order matches the input order
  // without mutating the parsed response.
  const sorted = [...data.data].sort((a, b) => a.index - b.index);

  return {
    embeddings: sorted.map(item => item.embedding),
    usage: data.usage,
    model: data.model
  };
}

// ============================================================================
// Vector Math
// ============================================================================

/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Iterates over the length of `a`; both vectors are expected to have the
 * same length.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Cosine similarity, or 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);

  // A zero-magnitude vector would yield 0/0 = NaN, which in turn makes the
  // ranking comparator in findTopMatches unreliable.
  if (denominator === 0) {
    return 0;
  }

  return dotProduct / denominator;
}

/**
 * Ranks categories by cosine similarity to a query embedding.
 *
 * @param {number[]} queryEmbedding - Embedding to compare against.
 * @param {Array<{embedding: number[]}>} categoryEmbeddings - Candidates, each carrying an `embedding`.
 * @param {number} [topK=10] - Maximum number of matches to return.
 * @returns {Array<object>} Copies of the best-scoring candidates with an added
 *   `similarity` field, sorted from most to least similar.
 */
function findTopMatches(queryEmbedding, categoryEmbeddings, topK = 10) {
  const scored = categoryEmbeddings.map(cat => ({
    ...cat,
    similarity: cosineSimilarity(queryEmbedding, cat.embedding)
  }));

  scored.sort((a, b) => b.similarity - a.similarity);

  return scored.slice(0, topK);
}

// ============================================================================
// Database Functions
// ============================================================================

/**
 * Loads categories of types 10-13 and derives each one's full ancestor path.
 *
 * @param {object} connection - Database connection exposing `query(sql)` that
 *   resolves to an array whose first element is the row list.
 * @returns {Promise<Array<{id: *, name: string, type: number, fullPath: string, embeddingText: string}>>}
 *   One entry per row; `fullPath` joins ancestor names with ' > ' and
 *   `embeddingText` joins them with spaces.
 */
async function fetchCategories(connection) {
  console.log('\n📂 Fetching categories from database...');

  // Fetch hierarchical categories (types 10-13)
  const [rows] = await connection.query(`
    SELECT cat_id, name, master_cat_id, type
    FROM product_categories
    WHERE type IN (10, 11, 12, 13)
    ORDER BY type, name
  `);

  console.log(` Found ${rows.length} category records`);

  // Build category paths
  const byId = new Map(rows.map(r => [r.cat_id, r]));
  const categories = [];

  for (const row of rows) {
    const pathParts = [];
    const visited = new Set();
    let current = row;

    // Walk up the tree to build the full path. The visited set stops the
    // walk if master_cat_id links ever form a cycle.
    while (current && !visited.has(current.cat_id)) {
      visited.add(current.cat_id);
      pathParts.unshift(current.name);
      current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
    }

    categories.push({
      id: row.cat_id,
      name: row.name,
      type: row.type,
      fullPath: pathParts.join(' > '),
      embeddingText: pathParts.join(' ') // For embedding generation
    });
  }

  // Count by level
  const countOfType = type => categories.filter(c => c.type === type).length;
  const levels = {
    10: countOfType(10),
    11: countOfType(11),
    12: countOfType(12),
    13: countOfType(13),
  };

  console.log(` Level breakdown: ${levels[10]} top-level, ${levels[11]} L2, ${levels[12]} L3, ${levels[13]} L4`);

  return categories;
}

// ============================================================================
// Main Script
// ============================================================================

/**
 * Runs the proof-of-concept: connects to the database, embeds every category,
 * matches each sample product against the category embeddings and prints a
 * summary.
 *
 * Exits immediately with status 1 when OPENAI_API_KEY is missing. Any later
 * failure is logged and reported through `process.exitCode`, so the database
 * connections are still closed before the process ends.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(HEAVY_RULE);
  console.log(' EMBEDDING PROOF-OF-CONCEPT');
  console.log(` Model: ${EMBEDDING_MODEL}`);
  console.log(HEAVY_RULE);

  if (!OPENAI_API_KEY) {
    console.error('❌ OPENAI_API_KEY not found in environment');
    process.exit(1);
  }

  try {
    // Step 1: Connect to database
    console.log('\n🔌 Connecting to database via SSH tunnel...');
    const { connection } = await getDbConnection();
    console.log(' ✅ Connected');

    // Step 2: Fetch categories
    const categories = await fetchCategories(connection);

    // Step 3: Generate embeddings for categories
    console.log('\n🧮 Generating embeddings for categories...');
    // Rough upper bound: budgets up to 1K tokens for every category text.
    console.log(` This will cost approximately $${(categories.length * COST_PER_1K_TOKENS).toFixed(4)}`);

    const startTime = Date.now();

    // Process in batches of 100 (OpenAI limit is 2048)
    const BATCH_SIZE = 100;
    const batchCount = Math.ceil(categories.length / BATCH_SIZE);
    let totalTokens = 0;

    for (let i = 0; i < categories.length; i += BATCH_SIZE) {
      const batch = categories.slice(i, i + BATCH_SIZE);
      const texts = batch.map(c => c.embeddingText);

      const result = await getEmbeddings(texts);

      // Attach embeddings to categories
      for (let j = 0; j < batch.length; j++) {
        batch[j].embedding = result.embeddings[j];
      }

      totalTokens += result.usage.total_tokens;

      console.log(` Batch ${Math.floor(i / BATCH_SIZE) + 1}/${batchCount}: ${batch.length} categories embedded`);
    }

    const embeddingTime = Date.now() - startTime;
    const embeddingCost = (totalTokens / 1000) * COST_PER_1K_TOKENS;
    console.log(` ✅ Generated ${categories.length} embeddings in ${embeddingTime}ms`);
    console.log(` 📊 Total tokens used: ${totalTokens} (~$${embeddingCost.toFixed(6)})`);

    // Step 4: Test with sample products
    console.log(`\n${HEAVY_RULE}`);
    console.log(' TESTING WITH SAMPLE PRODUCTS');
    console.log(HEAVY_RULE);

    // Timed separately from category embedding so the per-product figure
    // in the summary excludes the one-time embedding cost.
    const lookupStart = Date.now();

    for (const product of TEST_PRODUCTS) {
      console.log(`\n┌${BOX_RULE}`);
      console.log(`│ Product: "${product.name}"`);
      console.log(`│ Description: "${product.description.substring(0, 60)}..."`);
      console.log(`├${BOX_RULE}`);

      // Generate embedding for product
      const productText = `${product.name} ${product.description}`;
      const { embeddings: [productEmbedding] } = await getEmbeddings([productText]);

      // Find top matches
      const matches = findTopMatches(productEmbedding, categories, 10);

      console.log('│ Top 10 Category Matches:');
      matches.forEach((match, i) => {
        const similarity = (match.similarity * 100).toFixed(1);
        // One glyph per 5% similarity; a single-character glyph keeps the
        // padEnd(20) column alignment correct.
        const bar = '█'.repeat(Math.round(match.similarity * 20));
        const marker = i < 3 ? ' ✅' : '';
        console.log(`│ ${(i + 1).toString().padStart(2)}. [${similarity.padStart(5)}%] ${bar.padEnd(20)} ${match.fullPath}${marker}`);
      });

      console.log(`└${BOX_RULE}`);
    }

    const lookupTime = Date.now() - lookupStart;
    const perProductTime = TEST_PRODUCTS.length > 0
      ? Math.round(lookupTime / TEST_PRODUCTS.length)
      : 0;

    // Step 5: Summary
    console.log(`\n${HEAVY_RULE}`);
    console.log(' SUMMARY');
    console.log(HEAVY_RULE);
    console.log(` Categories embedded: ${categories.length}`);
    console.log(` Embedding time: ${embeddingTime}ms (one-time cost)`);
    console.log(` Per-product lookup: ~${perProductTime}ms`);
    console.log(` Vector dimensions: ${EMBEDDING_DIMENSIONS}`);
    console.log(` Memory usage: ~${(categories.length * EMBEDDING_DIMENSIONS * 4 / 1024 / 1024).toFixed(2)} MB (in-memory vectors)`);
    console.log('');
    console.log(' 💡 In production:');
    console.log(' - Category embeddings are computed once and cached');
    console.log(' - Only product embedding is computed per-request (~$0.00002)');
    console.log(' - Vector search is instant (in-memory cosine similarity)');
    console.log(' - Top 10 results go to AI for final selection (~$0.0001)');
    console.log(`${HEAVY_RULE}\n`);
  } catch (error) {
    console.error('\n❌ Error:', error.message);
    if (error.stack) {
      console.error(error.stack);
    }
    // Record the failure without calling process.exit(): exiting here would
    // terminate the process before the finally block closes the connections.
    process.exitCode = 1;
  } finally {
    await closeAllConnections();
    console.log('🔌 Database connections closed');
  }
}

// Run the script
main().catch((error) => {
  // Reached only if cleanup in main's finally block itself rejects.
  console.error(error);
  process.exitCode = 1;
});