Add AI embeddings and suggestions for categories, a few validation step tweaks/fixes
This commit is contained in:
283
inventory-server/scripts/embedding-poc.js
Normal file
283
inventory-server/scripts/embedding-poc.js
Normal file
@@ -0,0 +1,283 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Embedding Proof-of-Concept Script
|
||||
*
|
||||
* Demonstrates how category embeddings work for product matching.
|
||||
* Uses OpenAI text-embedding-3-small model.
|
||||
*
|
||||
* Usage: node scripts/embedding-poc.js
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
require('dotenv').config({ path: path.join(__dirname, '../.env') });
|
||||
|
||||
const { getDbConnection, closeAllConnections } = require('../src/utils/dbConnection');
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
// API key sent as the Bearer token on every embeddings request.
// Undefined when the environment variable is not set.
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;

// Model name passed in the `model` field of the embeddings request.
const EMBEDDING_MODEL = 'text-embedding-3-small';

// Vector length passed in the `dimensions` field of the embeddings request.
const EMBEDDING_DIMENSIONS = 1536;
|
||||
|
||||
// Sample products to test (you can modify these).
// Each entry needs a `name` and a `description`; both are concatenated to
// form the text that gets embedded for the product.
const TEST_PRODUCTS = [
  {
    name: "Cosmos Infinity Chipboard - Stamperia",
    description: "Laser-cut chipboard shapes featuring celestial designs for mixed media projects"
  },
  {
    name: "Distress Oxide Ink Pad - Mermaid Lagoon",
    description: "Water-reactive dye ink that creates an oxidized effect"
  },
  {
    name: "Hedwig Puffy Stickers - Paper House Productions",
    description: "3D puffy stickers featuring Harry Potter's owl Hedwig"
  },
  {
    name: "Black Velvet Watercolor Brush Size 6",
    description: "Round brush for watercolor painting with synthetic bristles"
  },
  {
    name: "Floral Washi Tape Set",
    description: "Decorative paper tape with flower patterns, pack of 6 rolls"
  }
];
|
||||
|
||||
// ============================================================================
|
||||
// OpenAI Embedding Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Requests embeddings for a batch of texts from the OpenAI embeddings endpoint.
 *
 * @param {string[]} texts - Texts to embed; each is truncated to 8000 characters.
 * @returns {Promise<{embeddings: number[][], usage: object, model: string}>}
 *   Embeddings in the same order as `texts`, plus the `usage` and `model`
 *   fields copied from the API response.
 * @throws {Error} When the HTTP response is not OK, or when the response does
 *   not contain exactly one embedding per input text.
 */
async function getEmbeddings(texts) {
  const response = await fetch('https://api.openai.com/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${OPENAI_API_KEY}`
    },
    body: JSON.stringify({
      input: texts.map(t => t.substring(0, 8000)), // Max 8k chars per text
      model: EMBEDDING_MODEL,
      dimensions: EMBEDDING_DIMENSIONS
    })
  });

  if (!response.ok) {
    // Error bodies are not guaranteed to be JSON (e.g. an HTML page from a
    // proxy), so fall back to the HTTP status when the body cannot be parsed.
    let detail = response.status;
    try {
      const payload = await response.json();
      detail = payload?.error?.message || response.status;
    } catch {
      // Keep the status code as the error detail.
    }
    throw new Error(`OpenAI API error: ${detail}`);
  }

  const data = await response.json();

  // A short or malformed response would otherwise surface later as
  // `undefined` embeddings attached to the wrong inputs.
  if (!Array.isArray(data.data) || data.data.length !== texts.length) {
    const received = Array.isArray(data.data) ? data.data.length : 0;
    throw new Error(`OpenAI API error: expected ${texts.length} embeddings, received ${received}`);
  }

  // Sort a copy by index to ensure order matches input without mutating the
  // parsed response.
  const sorted = [...data.data].sort((a, b) => a.index - b.index);

  return {
    embeddings: sorted.map(item => item.embedding),
    usage: data.usage,
    model: data.model
  };
}
|
||||
|
||||
// ============================================================================
|
||||
// Vector Math
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Iterates over the indices of `a`, so `b` is expected to have at least as
 * many elements as `a`.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Dot product divided by the product of the magnitudes, or
 *   0 when either vector has zero magnitude (including empty vectors), where
 *   the ratio would otherwise be NaN.
 */
function cosineSimilarity(a, b) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);

  // 0/0 would produce NaN, which breaks numeric sort comparators downstream.
  if (magnitude === 0) {
    return 0;
  }

  return dotProduct / magnitude;
}
|
||||
|
||||
/**
 * Ranks categories by cosine similarity to a query embedding.
 *
 * @param {number[]} queryEmbedding - Embedding of the text being matched.
 * @param {Array<object>} categoryEmbeddings - Category records carrying an
 *   `embedding` property. Records without one are skipped, so a single
 *   un-embedded category does not abort the whole ranking.
 * @param {number} [topK=10] - Maximum number of matches to return.
 * @returns {Array<object>} Shallow copies of the best-scoring records, each
 *   with an added `similarity` field, ordered from most to least similar.
 *   The input array and its records are not modified.
 */
function findTopMatches(queryEmbedding, categoryEmbeddings, topK = 10) {
  const scored = categoryEmbeddings
    .filter(cat => cat.embedding != null)
    .map(cat => ({
      ...cat,
      similarity: cosineSimilarity(queryEmbedding, cat.embedding)
    }));

  scored.sort((a, b) => b.similarity - a.similarity);

  return scored.slice(0, topK);
}
|
||||
|
||||
// ============================================================================
|
||||
// Database Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Loads the hierarchical categories (types 10-13) and resolves each one's
 * full path by following `master_cat_id` links to its ancestors.
 *
 * @param {object} connection - Database connection exposing an async
 *   `query(sql)` that resolves to an array whose first element is the rows.
 * @returns {Promise<Array<{id: *, name: string, type: number, fullPath: string, embeddingText: string}>>}
 *   One record per row, in query order. `fullPath` joins the ancestor names
 *   with ' > ' and `embeddingText` joins them with a single space.
 */
async function fetchCategories(connection) {
  console.log('\n📂 Fetching categories from database...');

  // Fetch hierarchical categories (types 10-13)
  const [rows] = await connection.query(`
    SELECT
      cat_id,
      name,
      master_cat_id,
      type
    FROM product_categories
    WHERE type IN (10, 11, 12, 13)
    ORDER BY type, name
  `);

  console.log(` Found ${rows.length} category records`);

  // Build category paths
  const byId = new Map(rows.map(r => [r.cat_id, r]));

  const categories = rows.map((row) => {
    // Named `segments` rather than `path` to avoid shadowing the module-level
    // `path` import.
    const segments = [];
    const visited = new Set();
    let current = row;

    // Walk up the tree to build the full path. The visited set stops the walk
    // if the parent links form a cycle (including a row that is its own
    // parent), which would otherwise loop forever.
    while (current && !visited.has(current.cat_id)) {
      visited.add(current.cat_id);
      segments.unshift(current.name);
      current = current.master_cat_id ? byId.get(current.master_cat_id) : null;
    }

    return {
      id: row.cat_id,
      name: row.name,
      type: row.type,
      fullPath: segments.join(' > '),
      embeddingText: segments.join(' ') // For embedding generation
    };
  });

  // Count by level
  const countOfType = (type) => categories.filter(c => c.type === type).length;
  const levels = {
    10: countOfType(10),
    11: countOfType(11),
    12: countOfType(12),
    13: countOfType(13),
  };

  console.log(` Level breakdown: ${levels[10]} top-level, ${levels[11]} L2, ${levels[12]} L3, ${levels[13]} L4`);

  return categories;
}
|
||||
|
||||
// ============================================================================
|
||||
// Main Script
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Runs the proof of concept end to end:
 *   1. connects to the database,
 *   2. fetches the category tree,
 *   3. embeds every category in batches,
 *   4. embeds each sample product and prints its top 10 category matches,
 *   5. prints a summary.
 *
 * On failure the error is logged and the process exit code is set to 1.
 * Connections are closed in `finally` on both the success and failure paths.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // text-embedding-3-small list price: $0.02 per 1M tokens.
  const USD_PER_TOKEN = 0.02 / 1e6;

  console.log('═══════════════════════════════════════════════════════════════');
  console.log(' EMBEDDING PROOF-OF-CONCEPT');
  console.log(' Model: ' + EMBEDDING_MODEL);
  console.log('═══════════════════════════════════════════════════════════════');

  if (!OPENAI_API_KEY) {
    console.error('❌ OPENAI_API_KEY not found in environment');
    // Nothing has been opened yet, so exiting immediately skips no cleanup.
    process.exit(1);
  }

  try {
    // Step 1: Connect to database
    console.log('\n🔌 Connecting to database via SSH tunnel...');
    const { connection } = await getDbConnection();
    console.log(' ✅ Connected');

    // Step 2: Fetch categories
    const categories = await fetchCategories(connection);

    // Step 3: Generate embeddings for categories
    console.log('\n🧮 Generating embeddings for categories...');
    // Rough pre-flight estimate using the ~4 characters per token rule of
    // thumb; the exact token count is reported after the requests complete.
    const estimatedTokens = categories.reduce(
      (sum, c) => sum + Math.ceil(c.embeddingText.length / 4),
      0
    );
    console.log(' This will cost approximately $' + (estimatedTokens * USD_PER_TOKEN).toFixed(6));

    const startTime = Date.now();

    // Process in batches of 100 (OpenAI limit is 2048)
    const BATCH_SIZE = 100;
    const batchCount = Math.ceil(categories.length / BATCH_SIZE);
    let totalTokens = 0;

    for (let i = 0; i < categories.length; i += BATCH_SIZE) {
      const batch = categories.slice(i, i + BATCH_SIZE);
      const texts = batch.map(c => c.embeddingText);

      const result = await getEmbeddings(texts);

      // Attach embeddings to categories. `batch` holds the same objects as
      // `categories`, so this updates the records used for matching below.
      for (let j = 0; j < batch.length; j++) {
        batch[j].embedding = result.embeddings[j];
      }

      totalTokens += result.usage.total_tokens;
      console.log(` Batch ${Math.floor(i / BATCH_SIZE) + 1}/${batchCount}: ${batch.length} categories embedded`);
    }

    const embeddingTime = Date.now() - startTime;
    console.log(` ✅ Generated ${categories.length} embeddings in ${embeddingTime}ms`);
    console.log(` 📊 Total tokens used: ${totalTokens} (~$${(totalTokens * USD_PER_TOKEN).toFixed(6)})`);

    // Step 4: Test with sample products
    console.log('\n═══════════════════════════════════════════════════════════════');
    console.log(' TESTING WITH SAMPLE PRODUCTS');
    console.log('═══════════════════════════════════════════════════════════════');

    // Timed separately so the per-product figure excludes the one-time
    // category embedding cost.
    const lookupStart = Date.now();

    for (const product of TEST_PRODUCTS) {
      console.log('\n┌─────────────────────────────────────────────────────────────');
      console.log(`│ Product: "${product.name}"`);
      console.log(`│ Description: "${product.description.substring(0, 60)}..."`);
      console.log('├─────────────────────────────────────────────────────────────');

      // Generate embedding for product
      const productText = `${product.name} ${product.description}`;
      const { embeddings: [productEmbedding] } = await getEmbeddings([productText]);

      // Find top matches
      const matches = findTopMatches(productEmbedding, categories, 10);

      console.log('│ Top 10 Category Matches:');
      matches.forEach((match, i) => {
        const similarity = (match.similarity * 100).toFixed(1);
        const bar = '█'.repeat(Math.round(match.similarity * 20));
        const marker = i < 3 ? ' ✅' : '';
        console.log(`│ ${(i + 1).toString().padStart(2)}. [${similarity.padStart(5)}%] ${bar.padEnd(20)} ${match.fullPath}${marker}`);
      });
      console.log('└─────────────────────────────────────────────────────────────');
    }

    const lookupTime = Date.now() - lookupStart;
    const perProductMs = TEST_PRODUCTS.length > 0
      ? Math.round(lookupTime / TEST_PRODUCTS.length)
      : 0;

    // Step 5: Summary
    console.log('\n═══════════════════════════════════════════════════════════════');
    console.log(' SUMMARY');
    console.log('═══════════════════════════════════════════════════════════════');
    console.log(` Categories embedded: ${categories.length}`);
    console.log(` Embedding time: ${embeddingTime}ms (one-time cost)`);
    console.log(` Per-product lookup: ~${perProductMs}ms`);
    console.log(` Vector dimensions: ${EMBEDDING_DIMENSIONS}`);
    console.log(` Memory usage: ~${(categories.length * EMBEDDING_DIMENSIONS * 4 / 1024 / 1024).toFixed(2)} MB (in-memory vectors)`);
    console.log('');
    console.log(' 💡 In production:');
    console.log(' - Category embeddings are computed once and cached');
    console.log(' - Only product embedding is computed per-request (~$0.00002)');
    console.log(' - Vector search is instant (in-memory cosine similarity)');
    console.log(' - Top 10 results go to AI for final selection (~$0.0001)');
    console.log('═══════════════════════════════════════════════════════════════\n');

  } catch (error) {
    console.error('\n❌ Error:', error.message);
    if (error.stack) {
      console.error(error.stack);
    }
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the finally block closes the connections.
    process.exitCode = 1;
  } finally {
    await closeAllConnections();
    console.log('🔌 Database connections closed');
  }
}
|
||||
|
||||
// Run the script. main() logs failures from its own steps; this handler covers
// anything that still escapes it (for example an error thrown while closing
// connections), so the promise is never left unhandled.
main().catch((error) => {
  console.error('\n❌ Error:', error?.message ?? error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user