前后端基本架构和完整excel表的解析及统计图表的生成以及excel表的导出
This commit is contained in:
146
frontend/supabase/functions/process-document/index.ts
Normal file
146
frontend/supabase/functions/process-document/index.ts
Normal file
@@ -0,0 +1,146 @@
|
||||
import { serve } from 'https://deno.land/std@0.168.0/http/server.ts';
|
||||
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2';
|
||||
import { corsHeaders } from '../_shared/cors.ts';
|
||||
import * as XLSX from 'https://esm.sh/xlsx@0.18.5';
|
||||
import * as mammoth from 'https://esm.sh/mammoth@1.5.1';
|
||||
|
||||
serve(async (req) => {
|
||||
if (req.method === 'OPTIONS') {
|
||||
return new Response(null, { headers: corsHeaders });
|
||||
}
|
||||
|
||||
try {
|
||||
const { documentId } = await req.json();
|
||||
if (!documentId) throw new Error('Missing documentId');
|
||||
|
||||
const supabaseUrl = Deno.env.get('SUPABASE_URL')!;
|
||||
const supabaseKey = Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!;
|
||||
const supabase = createClient(supabaseUrl, supabaseKey);
|
||||
|
||||
// Get document details
|
||||
const { data: document, error: docError } = await supabase
|
||||
.from('documents')
|
||||
.select('*')
|
||||
.eq('id', documentId)
|
||||
.single();
|
||||
|
||||
if (docError || !document) throw new Error('Document not found');
|
||||
|
||||
// Download file
|
||||
const { data: fileData, error: dlError } = await supabase.storage
|
||||
.from('document_storage')
|
||||
.download(document.storage_path);
|
||||
|
||||
if (dlError || !fileData) throw new Error('Failed to download file');
|
||||
|
||||
const arrayBuffer = await fileData.arrayBuffer();
|
||||
const uint8Array = new Uint8Array(arrayBuffer);
|
||||
let extractedText = '';
|
||||
|
||||
// Parse file based on type
|
||||
const fileExt = document.type.toLowerCase();
|
||||
if (fileExt === 'txt' || fileExt === 'md') {
|
||||
extractedText = new TextDecoder().decode(uint8Array);
|
||||
} else if (fileExt === 'docx') {
|
||||
const result = await mammoth.extractRawText({ arrayBuffer });
|
||||
extractedText = result.value;
|
||||
} else if (fileExt === 'xlsx' || fileExt === 'xls') {
|
||||
const workbook = XLSX.read(arrayBuffer, { type: 'array' });
|
||||
extractedText = workbook.SheetNames.map(name => {
|
||||
const sheet = workbook.Sheets[name];
|
||||
return XLSX.utils.sheet_to_txt(sheet);
|
||||
}).join('\n\n');
|
||||
} else {
|
||||
throw new Error(`Unsupported file type: ${fileExt}`);
|
||||
}
|
||||
|
||||
if (!extractedText.trim()) throw new Error('Document is empty');
|
||||
|
||||
// Call MiniMax for entity extraction
|
||||
const miniMaxApiKey = Deno.env.get('INTEGRATIONS_API_KEY');
|
||||
const miniMaxResponse = await fetch(
|
||||
'https://app-a6ww9j3ja3nl-api-Aa2PqMJnJGwL-gateway.appmiaoda.com/v1/text/chatcompletion_v2',
|
||||
{
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'X-Gateway-Authorization': `Bearer ${miniMaxApiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: 'MiniMax-M2.5',
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: '你是一个文档信息提取专家。请从提供的文档内容中提取关键实体信息(如姓名、日期、金额、项目名称、地址、关键指标等)。输出格式必须为 JSON 数组,包含 entity_type, entity_value, confidence 三个字段。'
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: `文档内容如下:\n\n${extractedText.slice(0, 15000)}` // Limit text for token budget
|
||||
}
|
||||
],
|
||||
response_format: {
|
||||
type: 'json_schema',
|
||||
json_schema: {
|
||||
name: 'extracted_entities',
|
||||
schema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
entities: {
|
||||
type: 'array',
|
||||
items: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
entity_type: { type: 'string' },
|
||||
entity_value: { type: 'string' },
|
||||
confidence: { type: 'number' }
|
||||
},
|
||||
required: ['entity_type', 'entity_value']
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
);
|
||||
|
||||
const miniMaxData = await miniMaxResponse.json();
|
||||
if (!miniMaxResponse.ok) {
|
||||
console.error('MiniMax Error:', miniMaxData);
|
||||
throw new Error('MiniMax extraction failed');
|
||||
}
|
||||
|
||||
const extractionResult = JSON.parse(miniMaxData.choices[0].message.content);
|
||||
const entities = extractionResult.entities || [];
|
||||
|
||||
// Save entities
|
||||
if (entities.length > 0) {
|
||||
const entitiesToInsert = entities.map((e: any) => ({
|
||||
document_id: documentId,
|
||||
entity_type: e.entity_type,
|
||||
entity_value: e.entity_value,
|
||||
confidence: e.confidence || 1.0
|
||||
}));
|
||||
|
||||
await supabase.from('extracted_entities').insert(entitiesToInsert);
|
||||
}
|
||||
|
||||
// Update document
|
||||
await supabase.from('documents').update({
|
||||
content_text: extractedText,
|
||||
status: 'completed'
|
||||
}).eq('id', documentId);
|
||||
|
||||
return new Response(JSON.stringify({ success: true, entities }), {
|
||||
headers: { ...corsHeaders, 'Content-Type': 'application/json' }
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error processing document:', error);
|
||||
return new Response(JSON.stringify({ error: error.message }), {
|
||||
status: 500,
|
||||
headers: { ...corsHeaders, 'Content-Type': 'application/json' }
|
||||
});
|
||||
}
|
||||
});
|
||||
Reference in New Issue
Block a user