Update AI validation to use GPT-5 and the new Responses API

This commit is contained in:
2025-10-01 22:18:26 -04:00
parent e10df632d8
commit 60875c25a6
7 changed files with 1184 additions and 494 deletions

View File

@@ -6,7 +6,7 @@ const path = require("path");
const dotenv = require("dotenv");
const mysql = require('mysql2/promise');
const { Client } = require('ssh2');
const { getDbConnection } = require('../utils/dbConnection'); // Import the optimized connection function
const { getDbConnection, closeAllConnections } = require('../utils/dbConnection'); // Import the optimized connection function
// Ensure environment variables are loaded
dotenv.config({ path: path.join(__dirname, "../../.env") });
@@ -19,6 +19,121 @@ if (!process.env.OPENAI_API_KEY) {
console.error("Warning: OPENAI_API_KEY is not set in environment variables");
}
/**
 * Thin wrapper around the OpenAI Responses API that fails fast with a clear
 * error when the installed SDK predates `responses.create`.
 *
 * @param {object} payload - Request body forwarded verbatim to responses.create.
 * @returns {Promise<object>} The raw Responses API result.
 * @throws {Error} If the openai client does not expose responses.create.
 */
async function createResponsesCompletion(payload) {
  const responsesApi = openai.responses;
  if (!responsesApi?.create) {
    throw new Error(
      "OpenAI client does not expose responses.create; please verify the openai SDK version."
    );
  }
  return responsesApi.create(payload);
}
// Name under which the strict JSON-schema output format is registered with
// the Responses API (see AI_VALIDATION_TEXT_FORMAT below).
const AI_VALIDATION_SCHEMA_NAME = "ai_validation_response";
// Permissive sub-schemas describing values whose exact shape is not known
// ahead of time (product fields and metadata vary per record).
const FLEXIBLE_PRIMITIVE_SCHEMAS = [
  { type: "string" },
  { type: "number" },
  { type: "boolean" },
  { type: "null" },
];
// An array holding any mix of the primitive types above.
const FLEXIBLE_ARRAY_SCHEMA = {
  type: "array",
  items: {
    anyOf: FLEXIBLE_PRIMITIVE_SCHEMAS,
  },
};
// An object with arbitrary keys; each value is a primitive or primitive array.
// NOTE(review): OpenAI's strict structured-output mode documents a limited
// JSON Schema keyword subset; confirm that `patternProperties` here (and the
// `default` keywords below) are accepted when `strict: true` — TODO verify
// against the structured-outputs documentation.
const FLEXIBLE_OBJECT_SCHEMA = {
  type: "object",
  properties: {},
  patternProperties: {
    ".+": {
      anyOf: [...FLEXIBLE_PRIMITIVE_SCHEMAS, FLEXIBLE_ARRAY_SCHEMA],
    },
  },
  additionalProperties: false,
};
// Any JSON value: primitive, array of primitives, or flat object.
const FLEXIBLE_VALUE_SCHEMA = {
  anyOf: [...FLEXIBLE_PRIMITIVE_SCHEMAS, FLEXIBLE_ARRAY_SCHEMA, FLEXIBLE_OBJECT_SCHEMA],
};
// Top-level response contract the model must emit for /validate: corrected
// product rows plus human-readable change/warning/summary fields.
const AI_VALIDATION_JSON_SCHEMA = {
  type: "object",
  additionalProperties: false,
  // All seven keys are required so downstream parsing never sees a partial object.
  required: [
    "correctedData",
    "changes",
    "warnings",
    "summary",
    "qualityNotes",
    "nextSteps",
    "metadata"
  ],
  properties: {
    // Updated product records; keys vary per product, hence the flexible schema.
    correctedData: {
      type: "array",
      items: {
        type: "object",
        properties: {},
        patternProperties: {
          ".+": FLEXIBLE_VALUE_SCHEMA,
        },
        additionalProperties: false,
      },
    },
    // Human-readable bullet points summarizing what was changed.
    changes: {
      type: "array",
      items: {
        type: "string",
      },
      default: [],
    },
    // Caveats or risks that still require manual review.
    warnings: {
      type: "array",
      items: {
        type: "string",
      },
      default: [],
    },
    // Concise overall description of data quality and improvements.
    summary: {
      type: "string",
      default: "",
    },
    // Short observations about validation quality.
    qualityNotes: {
      type: "array",
      items: {
        type: "string",
      },
      default: [],
    },
    // Recommended manual follow-up actions (may be empty).
    nextSteps: {
      type: "array",
      items: {
        type: "string",
      },
      default: [],
    },
    // Free-form supplemental machine-readable information.
    metadata: {
      type: "object",
      properties: {},
      patternProperties: {
        ".+": FLEXIBLE_VALUE_SCHEMA,
      },
      additionalProperties: false,
    },
  },
};
// `text.format` payload passed to openai.responses.create so the model is
// constrained to emit JSON matching AI_VALIDATION_JSON_SCHEMA.
const AI_VALIDATION_TEXT_FORMAT = {
  type: "json_schema",
  name: AI_VALIDATION_SCHEMA_NAME,
  strict: true,
  schema: AI_VALIDATION_JSON_SCHEMA,
};
// Debug endpoint for viewing prompt
router.post("/debug", async (req, res) => {
try {
@@ -139,6 +254,12 @@ router.post("/debug", async (req, res) => {
code: error.code || null,
name: error.name || null
});
} finally {
try {
await closeAllConnections();
} catch (closeError) {
console.error("⚠️ Failed to close DB connections after debug request:", closeError);
}
}
});
@@ -402,8 +523,9 @@ async function generateDebugResponse(productsToUse, res) {
console.log("Sending response with taxonomy stats:", response.taxonomyStats);
return response;
} finally {
if (promptConnection) await promptConnection.end();
} catch (promptLoadError) {
console.error("Error loading prompt:", promptLoadError);
throw promptLoadError;
}
} catch (error) {
console.error("Error generating debug response:", error);
@@ -883,34 +1005,80 @@ router.post("/validate", async (req, res) => {
console.log("🔄 Loading prompt with filtered taxonomy...");
const promptData = await loadPrompt(connection, products, req.app.locals.pool);
const fullUserPrompt = promptData.userContent + "\n" + JSON.stringify(products);
const promptLength = promptData.systemInstructions.length + fullUserPrompt.length; // Store prompt length for performance metrics
promptLength = promptData.systemInstructions.length + fullUserPrompt.length; // Store prompt length for performance metrics
console.log("📝 Generated prompt length:", promptLength);
console.log("📝 System instructions length:", promptData.systemInstructions.length);
console.log("📝 User content length:", fullUserPrompt.length);
console.log("🤖 Sending request to OpenAI...");
const completion = await openai.chat.completions.create({
model: "gpt-4o",
messages: [
console.log("🤖 Sending request to OpenAI Responses API...");
// GPT-5 Responses API Configuration:
// - Using "gpt-5" (reasoning model) for complex product validation
// - reasoning.effort: "medium" balances quality and speed (minimal, low, medium, high)
// - text.verbosity: "medium" provides balanced output detail (low, medium, high)
// - max_output_tokens: 20000 ensures space for large product batches
// Note: Responses API is the recommended endpoint for GPT-5 models
const completion = await createResponsesCompletion({
model: "gpt-5",
input: [
{
role: "system",
content: promptData.systemInstructions,
role: "developer",
content: `${promptData.systemInstructions}\n\nYou MUST respond with a single valid JSON object containing the following top-level keys: correctedData, changes, warnings, summary, qualityNotes, nextSteps, metadata.\n- correctedData: array of product objects reflecting the updated data.\n- changes: array of human-readable bullet points summarizing the nature of updates.\n- warnings: array of caveats or risks that still require review.\n- summary: a concise paragraph (<=75 words) describing overall data quality and improvements.\n- qualityNotes: array of short comments (<=40 words each) about validation quality or notable observations.\n- nextSteps: array of recommended manual follow-up actions (if none, provide an empty array).\n- metadata: object containing any supplemental machine-readable information (optional fields allowed).\nDo NOT include Markdown code fences or any text outside the JSON object.`,
},
{
role: "user",
content: fullUserPrompt,
},
],
temperature: 0.2,
response_format: { type: "json_object" },
reasoning: {
effort: "medium"
},
text: {
verbosity: "medium",
format: AI_VALIDATION_TEXT_FORMAT,
},
max_output_tokens: 20000,
});
console.log("✅ Received response from OpenAI");
const rawResponse = completion.choices[0].message.content;
console.log("📄 Raw AI response length:", rawResponse.length);
console.log("✅ Received response from OpenAI Responses API");
// Responses API structure: response has 'output' array with message objects
const rawResponse = extractResponseText(completion);
console.log("📄 Raw AI response length:", rawResponse ? rawResponse.length : 0);
if (!rawResponse) {
throw new Error("OpenAI response did not include any text output");
}
const responseModel = completion.model;
const usage = completion.usage || {};
// GPT-5 Responses API provides detailed token usage including reasoning tokens
const tokenUsageSummary = {
prompt: usage.input_tokens ?? usage.prompt_tokens ?? null,
completion: usage.output_tokens ?? usage.completion_tokens ?? null,
total: usage.total_tokens ?? null,
// GPT-5 reasoning tokens are in output_tokens_details
reasoning: usage.output_tokens_details?.reasoning_tokens ?? usage.completion_tokens_details?.reasoning_tokens ?? null,
// Also capture text generation tokens separately from reasoning
textGeneration: usage.output_tokens_details?.text_generation_tokens ?? usage.completion_tokens_details?.text_generation_tokens ?? null,
cachedPrompt: usage.input_tokens_details?.cached_tokens ?? usage.prompt_tokens_details?.cached_tokens ?? null,
// Capture audio tokens if present (future GPT-5 feature)
audioTokens: usage.output_tokens_details?.audio_tokens ?? usage.completion_tokens_details?.audio_tokens ?? null,
};
// Extract reasoning_effort and verbosity that were actually applied
const reasoningEffortApplied = completion.reasoning?.effort || "medium";
const verbosityApplied = completion.text?.verbosity || "medium";
console.log("📊 Token usage summary:", tokenUsageSummary);
console.log("🤖 Model dispatched:", responseModel);
console.log("🧠 Reasoning effort applied:", reasoningEffortApplied);
console.log("📝 Verbosity applied:", verbosityApplied);
try {
const aiResponse = JSON.parse(rawResponse);
const normalizedResponse = normalizeJsonResponse(rawResponse);
const aiResponse = JSON.parse(normalizedResponse);
console.log(
"🔄 Parsed AI response with keys:",
Object.keys(aiResponse)
@@ -975,7 +1143,12 @@ router.post("/validate", async (req, res) => {
const endTime = new Date();
let performanceMetrics = {
promptLength,
productCount: products.length
productCount: products.length,
model: responseModel,
tokenUsage: tokenUsageSummary,
reasoningTokens: tokenUsageSummary.reasoning,
reasoningEffort: reasoningEffortApplied,
verbosity: verbosityApplied,
};
try {
@@ -1040,83 +1213,78 @@ router.post("/validate", async (req, res) => {
let promptSources = null;
try {
// Get system prompt
const systemPromptResult = await pool.query(`
SELECT * FROM ai_prompts WHERE prompt_type = 'system'
`);
// Get general prompt
const generalPromptResult = await pool.query(`
SELECT * FROM ai_prompts WHERE prompt_type = 'general'
`);
// Extract unique company IDs from products
const companyIds = new Set();
products.forEach(product => {
if (product.company) {
companyIds.add(String(product.company));
}
});
let companyPrompts = [];
if (companyIds.size > 0) {
// Fetch company-specific prompts
const companyPromptsResult = await pool.query(`
SELECT * FROM ai_prompts
WHERE prompt_type = 'company_specific'
AND company = ANY($1)
`, [Array.from(companyIds)]);
companyPrompts = companyPromptsResult.rows;
}
// Find company names from taxonomy for the validation endpoint
const companyPromptsWithNames = companyPrompts.map(prompt => {
let companyName = "Unknown Company";
if (taxonomy.companies && Array.isArray(taxonomy.companies)) {
const companyData = taxonomy.companies.find(company =>
String(company[0]) === String(prompt.company)
);
if (companyData && companyData[1]) {
companyName = companyData[1];
// Use the local PostgreSQL pool from the app
const pool = req.app.locals.pool;
if (!pool) {
console.warn("⚠️ Local database pool not available for prompt sources");
} else {
// Get system prompt
const systemPromptResult = await pool.query(`
SELECT * FROM ai_prompts WHERE prompt_type = 'system'
`);
// Get general prompt
const generalPromptResult = await pool.query(`
SELECT * FROM ai_prompts WHERE prompt_type = 'general'
`);
// Extract unique company IDs from products
const companyIds = new Set();
products.forEach(product => {
if (product.company) {
companyIds.add(String(product.company));
}
});
let companyPrompts = [];
if (companyIds.size > 0) {
// Fetch company-specific prompts
const companyPromptsResult = await pool.query(`
SELECT * FROM ai_prompts
WHERE prompt_type = 'company_specific'
AND company = ANY($1)
`, [Array.from(companyIds)]);
companyPrompts = companyPromptsResult.rows;
}
return {
// Format company prompts for response
// Note: Company names would require re-fetching taxonomy data
// For now, we include company ID only
const companyPromptsWithNames = companyPrompts.map(prompt => ({
id: prompt.id,
company: prompt.company,
companyName: companyName,
prompt_text: prompt.prompt_text
};
});
// Set prompt sources
if (generalPromptResult.rows.length > 0) {
const generalPrompt = generalPromptResult.rows[0];
let systemPrompt = null;
if (systemPromptResult.rows.length > 0) {
systemPrompt = systemPromptResult.rows[0];
}));
// Set prompt sources
if (generalPromptResult.rows.length > 0) {
const generalPrompt = generalPromptResult.rows[0];
let systemPrompt = null;
if (systemPromptResult.rows.length > 0) {
systemPrompt = systemPromptResult.rows[0];
}
promptSources = {
...(systemPrompt ? {
systemPrompt: {
id: systemPrompt.id,
prompt_text: systemPrompt.prompt_text
}
} : {
systemPrompt: {
id: 0,
prompt_text: `You are a specialized e-commerce product data processor for a crafting supplies website tasked with providing complete, correct, appealing, and SEO-friendly product listings. You should write professionally, but in a friendly and engaging tone. You have meticulous attention to detail and are a master at your craft.`
}
}),
generalPrompt: {
id: generalPrompt.id,
prompt_text: generalPrompt.prompt_text
},
companyPrompts: companyPromptsWithNames
};
}
promptSources = {
...(systemPrompt ? {
systemPrompt: {
id: systemPrompt.id,
prompt_text: systemPrompt.prompt_text
}
} : {
systemPrompt: {
id: 0,
prompt_text: `You are a specialized e-commerce product data processor for a crafting supplies website tasked with providing complete, correct, appealing, and SEO-friendly product listings. You should write professionally, but in a friendly and engaging tone. You have meticulous attention to detail and are a master at your craft.`
}
}),
generalPrompt: {
id: generalPrompt.id,
prompt_text: generalPrompt.prompt_text
},
companyPrompts: companyPromptsWithNames
};
}
} catch (promptSourceError) {
console.error("⚠️ Error getting prompt sources:", promptSourceError);
@@ -1126,16 +1294,26 @@ router.post("/validate", async (req, res) => {
// Include prompt sources in the response
res.json({
success: true,
changeDetails: changeDetails,
performanceMetrics: performanceMetrics || {
// Fallback: calculate a simple estimate
promptLength: promptLength,
processingTimeSeconds: Math.max(15, Math.round(promptLength / 1000)),
isEstimate: true,
productCount: products.length
},
promptSources: promptSources,
...aiResponse,
changeDetails,
performanceMetrics:
performanceMetrics || {
// Fallback: calculate a simple estimate
promptLength,
processingTimeSeconds: Math.max(15, Math.round(promptLength / 1000)),
isEstimate: true,
productCount: products.length,
model: responseModel,
tokenUsage: tokenUsageSummary,
reasoningTokens: tokenUsageSummary.reasoning,
reasoningEffort: reasoningEffortApplied,
verbosity: verbosityApplied,
},
promptSources,
model: responseModel,
tokenUsage: tokenUsageSummary,
reasoningEffort: reasoningEffortApplied,
verbosity: verbosityApplied,
});
} catch (parseError) {
console.error("❌ Error parsing AI response:", parseError);
@@ -1151,10 +1329,6 @@ router.post("/validate", async (req, res) => {
success: false,
error: "OpenAI API Error: " + openaiError.message,
});
} finally {
// Clean up database connection and SSH tunnel
if (connection) await connection.end();
if (ssh) ssh.end();
}
} catch (error) {
console.error("❌ AI Validation Error:", error);
@@ -1167,6 +1341,12 @@ router.post("/validate", async (req, res) => {
success: false,
error: error.message || "Error during AI validation",
});
} finally {
try {
await closeAllConnections();
} catch (closeError) {
console.error("⚠️ Failed to close DB connections after validation request:", closeError);
}
}
});
@@ -1249,8 +1429,11 @@ router.get("/test-taxonomy", async (req, res) => {
timestamp: new Date().toISOString()
});
} finally {
if (connection) await connection.end();
if (ssh) ssh.end();
try {
await closeAllConnections();
} catch (closeError) {
console.error("⚠️ Failed to close DB connections after test-taxonomy request:", closeError);
}
}
} catch (error) {
console.error("Test taxonomy endpoint error:", error);
@@ -1262,3 +1445,99 @@ router.get("/test-taxonomy", async (req, res) => {
});
module.exports = router;
/**
 * Pull the plain-text payload out of an OpenAI Responses API result.
 * Checks, in order: the `output`/`outputs` message arrays, the top-level
 * `output_text` convenience field, and finally the legacy Chat Completions
 * `choices[0].message.content` shape.
 *
 * @param {object|null|undefined} response - Raw API response object.
 * @returns {string} Concatenated, trimmed text; "" when nothing is found.
 */
function extractResponseText(response) {
  if (!response) return "";

  const messageOutputs = [
    ...(Array.isArray(response.output) ? response.output : []),
    ...(Array.isArray(response.outputs) ? response.outputs : []),
  ];

  let pieces = messageOutputs.flatMap((entry) =>
    collectTextSegments(entry?.content ?? entry)
  );

  if (pieces.length === 0 && typeof response.output_text === "string") {
    pieces = [response.output_text];
  }

  if (pieces.length === 0 && response.choices?.length) {
    pieces = collectTextSegments(response.choices?.[0]?.message?.content);
  }

  return pieces.join("").trim();
}

/**
 * Recursively gather every text-like fragment from a Responses API content
 * node. Handles raw primitives, arrays, and objects carrying `text`,
 * `content`, `output_text`, `value`, or `data` fields (checked in that order
 * so concatenation order is stable).
 *
 * @param {*} node - Any content node from the API response.
 * @returns {string[]} Flat list of text fragments (possibly empty).
 */
function collectTextSegments(node) {
  if (node == null) return [];

  const kind = typeof node;
  if (kind === "string" || kind === "number" || kind === "boolean") {
    return [String(node)];
  }

  if (Array.isArray(node)) {
    return node.flatMap((child) => collectTextSegments(child));
  }

  if (kind !== "object") {
    return [];
  }

  const found = [];
  // Fields that may hold either a plain string or a nested array of nodes.
  for (const field of ["text", "content", "output_text"]) {
    const value = node[field];
    if (typeof value === "string") {
      found.push(value);
    } else if (Array.isArray(value)) {
      found.push(...value.flatMap((child) => collectTextSegments(child)));
    }
  }
  // Fields that are only ever consumed as plain strings.
  if (typeof node.value === "string") {
    found.push(node.value);
  }
  if (typeof node.data === "string") {
    found.push(node.data);
  }
  return found;
}
/**
 * Strip Markdown code fences (e.g. ```json ... ```) that models sometimes
 * wrap around JSON output, returning a string safe to hand to JSON.parse.
 *
 * Fix: the previous version only dropped the opening fence's first LINE, so a
 * fenced response with no newline after the fence (```json{"a":1}```) kept the
 * "json" language tag glued to the payload. The fence and its optional
 * language tag are now removed together, with or without a trailing newline.
 *
 * @param {*} text - Raw model output; non-strings are returned unchanged.
 * @returns {*} Trimmed, fence-free string, or the input when not a string.
 */
function normalizeJsonResponse(text) {
  if (!text || typeof text !== 'string') return text;
  let cleaned = text.trim();
  if (cleaned.startsWith('```')) {
    // Remove the opening fence plus an optional language tag and line break.
    cleaned = cleaned.replace(/^```[A-Za-z0-9_-]*[ \t]*\r?\n?/, '');
    // Remove the last closing fence, if present.
    const closingFenceIndex = cleaned.lastIndexOf('```');
    if (closingFenceIndex !== -1) {
      cleaned = cleaned.substring(0, closingFenceIndex);
    }
    cleaned = cleaned.trim();
  }
  return cleaned;
}