From 6b88e971e8a72a24877ba3d853cab229963a1179 Mon Sep 17 00:00:00 2001 From: KiriAky 107 Date: Thu, 26 Mar 2026 23:41:03 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8E=E7=AB=AF=E5=AE=8C=E6=88=90=E5=BC=82?= =?UTF-8?q?=E6=AD=A5=E5=92=8Crag=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/.env.example | 4 +- .../api/__pycache__/__init__.cpython-312.pyc | Bin 726 -> 1276 bytes .../__pycache__/documents.cpython-312.pyc | Bin 0 -> 14740 bytes backend/app/api/endpoints/documents.py | 203 ++++---- backend/app/api/endpoints/templates.py | 5 +- backend/app/config.py | 2 +- .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 577 bytes .../__pycache__/mysql.cpython-312.pyc | Bin 0 -> 9688 bytes .../__pycache__/__init__.cpython-312.pyc | Bin 379 -> 2873 bytes .../__pycache__/docx_parser.cpython-312.pyc | Bin 0 -> 5713 bytes .../__pycache__/md_parser.cpython-312.pyc | Bin 0 -> 9774 bytes .../__pycache__/txt_parser.cpython-312.pyc | Bin 0 -> 8798 bytes backend/app/services/excel_storage_service.py | 144 +++++ backend/app/services/table_rag_service.py | 491 ++++++++++++++++++ 14 files changed, 741 insertions(+), 108 deletions(-) create mode 100644 backend/app/api/endpoints/__pycache__/documents.cpython-312.pyc create mode 100644 backend/app/core/database/__pycache__/__init__.cpython-312.pyc create mode 100644 backend/app/core/database/__pycache__/mysql.cpython-312.pyc create mode 100644 backend/app/core/document_parser/__pycache__/docx_parser.cpython-312.pyc create mode 100644 backend/app/core/document_parser/__pycache__/md_parser.cpython-312.pyc create mode 100644 backend/app/core/document_parser/__pycache__/txt_parser.cpython-312.pyc create mode 100644 backend/app/services/table_rag_service.py diff --git a/backend/.env.example b/backend/.env.example index 7e23672..b10fd86 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -37,8 +37,8 @@ LLM_MODEL_NAME="MiniMax-Text-01" # 上传文件存储目录 (相对于项目根目录) UPLOAD_DIR="./data/uploads" -# ChromaDB 向量数据库持久化目录 -CHROMADB_PERSIST_DIR="./data/chromadb" +# Faiss 向量数据库持久化目录 (LangChain + Faiss 实现) +FAISS_INDEX_DIR="./data/faiss" # ==================== RAG 配置 ==================== # Embedding 模型名称 diff --git a/backend/app/api/__pycache__/__init__.cpython-312.pyc b/backend/app/api/__pycache__/__init__.cpython-312.pyc index ee51b9af6c205394b3f19d71c5e7fec28f69f962..1a3fce860c6ed4c2db23ae817bd52e14bf888641 100644 GIT binary patch literal 1276 zcmbVLF>ljA6uxs3J9grr4ye>ZPzjZiG)c=^AtV-57O2W*B3ak?AlLeA>uiJ4iHQY9 zq>=$eY(*UqV&E4rF+~bWbt`HMTdN9H>cm~!$O~ogN&fuacklb|o$UABGz}ou&h$63 zln3CaOvWpB9vt}!0Gq%CHgKV!nSA}^rEwrOOCvBf08{U7_8Yy% z5Zu}U#)V9EWYq>%Z3yPls+oShS094el$q@}d(9!Jrp%3gttV77Df2I<+Q^z4SaU;A dPwN^Zt2wZQ+kDE+kF14(weSPnW|zfX{{px+`)U9H delta 225 zcmeyvd5x9tG%qg~0}#wK*pb=IIFV0+Nr-WxhL#p10~13k(`t|u5JYjLa%2fZ*vMom zOP0taMp;ShlB{4!W}xI`1x8UuJSy2iDrF{5V>01nxy4pmkdvR7GWj2q4zr&o_heIM zyU7!oi&O-GHZuZoaUPKPz|6?V_>qZ;k>x8J1Ebu1hJecq0Uy~W8?dNOKFVUu!Vgpf E0OqVER{#J2 diff --git a/backend/app/api/endpoints/__pycache__/documents.cpython-312.pyc b/backend/app/api/endpoints/__pycache__/documents.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec1367a1f0a24ea3d6503d23a64bfd9316e6d863 GIT binary patch literal 14740 zcmb_@eN-FQweO5Zqi-Y-2!Q}0K8z9AU~Eim@JF!?F<|4wA4x-;=m8@;5P|4OijA-E zbsZupHl_iD#>OXB_^J?KK6l2!NB;P|gA$GqMo?ON~c zeMTb*MZ~%5-Luf1J?H!EIp^%(-sf;5B}GF)$k}zhD|aPD{RLm7po=i~j>;(NG{sSE z6i0J17u`mar>so|PsYXc$lK&SY#U2sy49=dZF<5hT!x<1wp79^U1>eWHe*kETRM4HxiWf8Z6?C2U70=RHZ$QhuB@Kywrs*{ zU6!7lww#{awp^N$QBJzMR_c80>9k&;#kaP6$fM+RT*`j>qW%Jm3PYPsMmeOFgBC!P|HVtBpaj}I?Tla%vj(EzYI}0vmNHsz$YR)9J3U692lQUn;ic8mWGA=tI zhUP4>n0<0C$Eo9TohmNRSM;(gjjliCZN za;FMwh1QDVYAof7olCeS@GF7eQuviR?a-?In7plwq+v;Wq+}VymUHEhYZ?5OJ9Dwb zJ{4C1@yizJTjc`jn79>jBd4+DeKM{pu1CxJW%iZ6m!PYfxu3o;_xc;R-+BG^(DSzX zhnj42|Mb@E$u~5bxgWnfcjEiAZ@xS?a%uL`Nikn3=k7dMdC=uKSlPo>dJlT-x5i(* zb!p@;N522r&)%4OZ3vjzw})qsj?Vt*#LoH$Z~yY2XD|IkbLW-k=LUyt4F@}&E?YF+ zR^Md1{o`|xGW+)Fxp)3;?&QhOem$nSgI}W6E)(S2x;$P%wWH75*5qj_i{&D-7q zg%(DdoH#m%$Logp_!?6lYUiQ;#`aD~>KC-+b*Izgb9oObD}7`bKHMEd5sLzao+$cw z=f19kg52A_-_wagkwRoqQTJ*f<}~G`U<;wzWP2zle7vg_3acEB-u51+ z!y#xKjvhDX19>?y7mI!=Z5m0y)X}bJx<^% z`uce@)I~@xFrG{x&rzY2)MvMaQq!N^A#ulc@I_L_RcoF)>djKpc<@~I*pK`j_@Qtl$6l|BTY1(gU2QQkamtAeVJ zckk!nOSJM9k_bFZAAxo9yqmY{cpNyNhs25`ABhb~DUt#tSdf<^u>%q0Tvw-;FT?jj zBt$gD$dyP0qdeaMB1uzcc(x^!VLGyPuyayw3gs0Xd2sN%L!HN;n3NabTjSu?q20$@ z@hvC+NWPqR$!LriBFv#4)8RT?TH2Ri6p1I+7XNUep+)n0> zADKIQeD=M!ZvBt5pS?a>dh5gI$R=XPMaelmoqQKr?LHLt+=&-v-+4JnW*<%oC_ek< zFF$+z#O%rUV)Y2AILE~O!GR0FBgk?33#om$Y8~w^Skg~9on8;D0DO{ZFKC{ES?c5+ zy!${W3zbVj3(T|dyZ17Dw-$Ex%orPR2mC&kN1uc-bnGLkwwN+&PT9p6r*C?}UT zevU;6bStGgV^2x34X}QeQ#?cSX$x9b`q@_~PW37i-vh5&D(}S+kuOTi=gq3nWOL6LdfRX={In6xJvebamn_AGNzcR?ZMrztOrlLWtlOX2j%OM~PLuQFV!U-1g{ zs_bIg1zcYcc2N&dC)`Dp7Z)7~#T37BK<&kKNA_J=>YHU6)n2AX~USF@Ln z&b|HC?3>2`6=+Ddh;q90^1oktnM71o*lzv$RaEAKV{>OO&Ym3;6+1~>Sz!|w@a*%? z&A#(KiCa}cRR42t*|wvK-LlD6I{Q!OW`FhrF^`>Os)kJea&+#sv#<+@ctL6py1G1C zLI19Yx7kW>fB4eeYsY3!{B-UoV{<=zN34P5OO7mXxDcS44?nnl;RiMVD7FYf!T8nC zc4C69^v%(8o4*l@f_y@Poe{fzVXd}w96Sg%$!8yI{ zU9PX_iaY-of}MrE7f_TSM;IkAKm}P3Coq7fd^lfjjsH-@UbkO9CRTm>=O50#_JdlR z4>$6EzjOk2K{1C7D-z)tR1<+@t>*#aczhlE+%DLn>}$sGw$tx4+LRq=*}#o9wfmJ1}U;BB+-{bKIq z(5(;N6sOEr`btpOmCsaFk}b9JyqvVj${`|9FzEx|xGmoOx zM^mMEd#a$4ddRCFSztYVt}f7nF82XYg}9~!xz7g}hDBN}$a&bOdEB`Kqo@5Dr$dBW zPL7OJP_=U$+4W(&=iQy4U>z`S0xSrg5Y#a|sc^aX?}tWEW!g1-1w8mlBn!nW_R!%x z=oQ7R>~!~f0jPlbfc7~D5jgGiprMG9P7FM1Ucn&tF4l+dLM%4qc?1OV8TG{xDqKeC zQctdbas7-wXOsz4H%#dp1FFWGhV+vI#|Nejg+W8%$f`-h(otE^P(E#_3L2^=4Xci9 z4Ji!)b6rqb2L(e}IU!T!m}>M;U~O9<{qcF0F=)aRqt?u8D1-6G#iz?omA$lVQlEe1 z!LZ!Q?w~_ix#!ep)URl#jYWgX8B@WusXAz?4(ZZE(4Fnaw}&#V^GbQ9W>6E>QR?hz zRY6cyFr&*2W#oi1Z6Rynxt=pUAyeMX;_8sK6#nii)zbn5w5#tBIV(dav!8NW1#T<*kA0?Nj>JfU5OxVHS)1-LoF#d$fK{ zgOFJCZ!%#Iv;Y1~vU@Q(3fd0poL_(tekL~f^sLt<>XeOx)J|&%cAyU(TY(F)? zpa$MTJ@F%o8j$i+r?JPMa(ikb(}7LW*BTkb-^zQusL_1;p(n5Tu~=+9I!1 zcag-!o}3{zNBq&zCm@B`{7PLs{{o~SA5b1vCLo10AEL6kj;biPc)VjOyK}HP zq%%+JtU;YMlvRjDwx8J^GUuEgI5jYBE(@B=MpsRmE5~F(^Qvj{+Ms#uq`3|@b({YG zQ1TQ*SWB5K;moKim?~U*`GLUBCjzR1Dbkp)w`zjy927-WY6CNdwv}V z^Jry5CjE;gTeQ>_GrdKvxRSML1-x8S(GY)4O(U;WH>5MyGHM(2%ym5r{PlDOQpNdN z50&tnkZeW*8iGo03u<8R7HvT_72EZ@<8=g-#o#9{VUFSCXJr>zY3GF9SP{P)!yfdF zdcjXiI~>}4-BDXkY=g8D%c1Zqx^dAZ1W$Hs$A=tA_WXmiM+R===ni|*;M!f(;Aztf!VtPHqsUvr8iVQ_rQ#o7|T z(ty&>aE50n{@=cZTvEBT`^qJ%ce1mlf>V|6?^Sbspyi= zm;1_M374I$mM%I$9sN@+1VmLrP?KmV$GcqwO$2Uq z_3poeTe66*-hN~D>}bRrO56|UWujLXeFebd#~0*1;A3bdrw9?h3o)hd;J1mZM)c5# z-Vz>xh=4T^9fsOWFo4J2*Xi}a(T9U;_qK!GPIMc|s=_ufmUls3D2G~W-j$O+Hj*mm^OfII2yr#vx#1o3d&^>qM*Ehkz@3oP<7~4?n zBvf#cO9WkT(s%^rKJX;L=dqi3RLRgc@JN$$o5*nif^|W&A7Y8yh8I}z`8|ji!77S` zM{7y`1wP!MuXJt9PGUP#$RK zdtuTF86+uMpIAmPCC0k(_d&;6v3XZ?Xu^Bn7AfgpCc;F_Ue2^)~d$50zPlY`Z z5jBoYLe%6*oGbjpI3gs&@bhFbl0zl6gJw0i^qJH8lAyi>JQ~B>LRMQ~d3`W#6LF0E z2RcSX?}&~v=1-^DgK73r`(#@6pdyrJKAn3iceraZt#nXv(_lJz_{GCB#+>2CbKA~r z`(L@Wqpg$1x`3{3UO^em->!+Q{AqJV&|CrDl2C@_bjzuh(>qS>7-<^iCo@)tW%5jY z$e4Ay@>FG*0siKanowTJ=iqzMhc#4Q(Ycm0E$4Qe*%4T_elmB%e408-^B%1ZMq&to1%x&rs(H&l&xwyzcHBK z7_t2#=VVKP3Eiz-;51Pvljheb68KMT23B5et5WR%1}C! znt57#N*l1Q4%9ptSiLon+w}X?=35Y}Kcx>8tQoib+WNkALOHqip}?Al1Nl3DpSo*a zMWyE?cCsO8YE0-Pc$@N@PNb26dtrp&uY!VFbppEpwJrK5_# ziu%hn6Kf`oyQg)#1G?P_AF}CYZhk1o9b@0iMPtkuF)`3jF z4Cey>MK}w(_eD6BO4&ny5mrE@U(VZzLBz6}#a&==le;No%s#tn_x>+axD6PI2UaqdDn+@`-H6_4b z%cYx*@@sh;fxiyCNq#-Az8YSBn?^Th$$x9CH^IxqQo7lqm?$+Pzmaavm48f?0{?L- z-E5VAY|jV&6BRc6iJC@UN2q~NBcUcjvk0{i3avp-E1@>^Hj4SgUaxGaV{WXVTUIG< zRI$j{uu$oZIu??DM=`)J@X+C2{w)0Ng}|=*ueH>`BPWYlYcb1g3*eq7{x?PK1uJlX z8IXfzh)#79lKJf5gi49ta|y{1%T6095cl|6KD6(m$SIa_h~_K2QQXvxJAFccnus?I z%A-ebVpEQkIR@<@rZ4$yuw#oClOE-lVF01JYz@TU?@_M-7?$UoZi6XjqjL{A+7-%F*z|0;gz#mo9-$*_I21|*pr? zu7L#98=Yr$y9nRkI`0}xNPh`mLK%_E*CRtU*3d+7+A zNsC@(F&n=RGwcV_%I`(?2_$=vbR!`~%hSlYkn|vN1F_5Ej2gtMycfuBuoOrMViO=P zZ{3z^)XaLP(Q%xbQ2tU<}Mj=PUkK=zHP96us@7ozPa>hYD6whe&q3oRjQF~V8E_l1_5Rkh=_ z!K%iPedSn9&|V8jzt93*vRA&VJ+FP&aNZDD(==Jy3`oBiAgtP&In)S9KdWHG6f`d# z-5502&R8p_tqnnIL#U*Z^eMEYV)RgONi7UC-vWR?+j18PVCnLzcgxS0zgu~}GEmzx zv2(KQpmuw+}PyehQ3X6#UK z`KD0i+EDfSac8i43!v(yg@CFH3ISCY6auQQTs@ykhGx)c(^z+KS^b1_>dE#%N9W{| zTrl4`ow_fWy6#Q!&lg+BKL?)Ep7&W+*Lr!H9B1tw0o2?4Fh6$g->AvPAiOIfdVoaN4^c%(cAyeKwJP4Fk?YtR^re)a zDo20Hh<$YDq;bWd@()_$3?6<~1a%c5b6zMbe}o;$7}noJ`@Zxp%UFzmRVu+v!ce;O zFThcNYmsJa&;ZvT*9Wp=cTmjL)2h`$)oNVCY|Y<270~!mx_&KvY+b#KxEb^Re|zN2z)BUs}myO^+F_S7Ztln%E4JxGMn^bbPXM) zVgBo-+F<_Y3^4KKV|Pa3tRph|_|h=HnZt0;(HpfXp^lAENA%<{=^BzJt9RiVit7d3 zrd=$5vc4?1f2s80S!Bdj|K2ae%M;>#KlpD9a<@XfL_xq?y!T&PLF9@9xg!_v@UOMm zwaHK$T<&)}$c=BD9|VyJnn&7QK4$~oAQyodfoKGiMZEDb>_x6B;D(9IO1!LQCX+FC zp$?qa;>Ed_AX^76w134Rl1pov6uhC8X8k7LsW$E8h8H)45GkG7FqvKy&=uV>=8tY3 zSNvM{zHZ9c5YRPzaZ5{9+B$Oe;g_4o>Hpq1rLPO9>i#yP&G`Sk2u3ai(3>^%v3j~b zkGjGle??(NK2N<_$y_PPtY61mS*HX3BUSBY8FN*}0)JJ>Ku&vwpmdN!0*9c)yMo{m z=eL0f;EbWa4_{DmkQ?~u#s`0WUl)&SjmPE0qXrW1tKcf)akGbO|NUHhueYm{2OF0X zvf-9WMdUwKJQaA+%n^ALZ#br5dZ(LrR-hlh1OCZ{`1!??f~*CFZzYG$!^WGAkm-5#rngr2%FJ95ZYauT-LULp)GWz(4AM-68%* zM+s(=_qpMpfaD!+w=2$k_%4>{KvIt6VI&0VldT?SV8VYje5G0^{+G%I{tZaQJ;O5t z6t-oWzAIyC`8^#)8~#XX|3nqsrXHE59{H5A&QdG?L{)xDt@xB$_bIjIPt?YnYRjN` zT5SocEyHUj)x}2?e^wiZ?V}q4Wwlf4x+98tjhxOHdTgG8$Gl!a+lI5}DR{tL7@6vY zobTrhK0azak~2xKxvQYEa)&p+)c=RP!Vz$dKQWc}PNK1n+OP~0!mV^SO>5wv zYQhXMHx0%xiwtxo3pzts72F~Xs|l;2GP3Sz3ERdz2>C`=g(-ZFdB?Z>{LnqZhj+{L zbj>Ierrm literal 0 HcmV?d00001 diff --git a/backend/app/api/endpoints/documents.py b/backend/app/api/endpoints/documents.py index ba27bff..a0bd91c 100644 --- a/backend/app/api/endpoints/documents.py +++ b/backend/app/api/endpoints/documents.py @@ -2,17 +2,20 @@ 文档管理 API 接口 支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引 +集成 Excel 存储和 AI 生成字段描述 """ +import logging import uuid -from datetime import datetime from typing import List, Optional from fastapi import APIRouter, UploadFile, File, HTTPException, Query, BackgroundTasks from pydantic import BaseModel from app.services.file_service import file_service -from app.core.database import mongodb, mysql_db +from app.core.database import mongodb, redis_db from app.services.rag_service import rag_service +from app.services.table_rag_service import table_rag_service +from app.services.excel_storage_service import excel_storage_service from app.core.document_parser import ParserFactory, ParseResult logger = logging.getLogger(__name__) @@ -31,7 +34,7 @@ class UploadResponse(BaseModel): class TaskStatusResponse(BaseModel): task_id: str - status: str # pending, processing, success, failure + status: str progress: int = 0 message: Optional[str] = None result: Optional[dict] = None @@ -44,7 +47,6 @@ class TaskStatusResponse(BaseModel): async def upload_document( background_tasks: BackgroundTasks, file: UploadFile = File(...), - doc_type: Optional[str] = Query(None, description="文档类型: docx/xlsx/md/txt"), parse_all_sheets: bool = Query(False, description="是否解析所有工作表(仅Excel)"), sheet_name: Optional[str] = Query(None, description="指定工作表(仅Excel)"), header_row: int = Query(0, description="表头行号(仅Excel)") @@ -56,13 +58,15 @@ async def upload_document( 1. 保存到本地存储 2. 解析内容 3. 存入 MongoDB (原始内容) - 4. 如果是 Excel,存入 MySQL (结构化数据) - 5. 建立 RAG 索引 + 4. 如果是 Excel: + - 存入 MySQL (结构化数据) + - AI 生成字段描述 + - 建立 RAG 索引 + 5. 建立 RAG 索引 (非结构化文档) """ if not file.filename: raise HTTPException(status_code=400, detail="文件名为空") - # 根据扩展名确定文档类型 file_ext = file.filename.split('.')[-1].lower() if file_ext not in ['docx', 'xlsx', 'xls', 'md', 'txt']: raise HTTPException( @@ -70,21 +74,16 @@ async def upload_document( detail=f"不支持的文件类型: {file_ext},仅支持 docx/xlsx/xls/md/txt" ) - # 生成任务ID task_id = str(uuid.uuid4()) try: - # 读取文件内容 content = await file.read() - - # 保存文件 saved_path = file_service.save_uploaded_file( content, file.filename, subfolder=file_ext ) - # 后台处理文档 background_tasks.add_task( process_document, task_id=task_id, @@ -114,13 +113,8 @@ async def upload_document( async def upload_documents( background_tasks: BackgroundTasks, files: List[UploadFile] = File(...), - doc_type: Optional[str] = Query(None, description="文档类型") ): - """ - 批量上传文档 - - 所有文档会异步处理,处理完成后可通过 task_id 查询状态 - """ + """批量上传文档""" if not files: raise HTTPException(status_code=400, detail="没有上传文件") @@ -131,25 +125,15 @@ async def upload_documents( for file in files: if not file.filename: continue - content = await file.read() - saved_path = file_service.save_uploaded_file( - content, - file.filename, - subfolder="batch" - ) + saved_path = file_service.save_uploaded_file(content, file.filename, subfolder="batch") saved_paths.append({ "path": saved_path, "filename": file.filename, "ext": file.filename.split('.')[-1].lower() }) - # 后台处理所有文档 - background_tasks.add_task( - process_documents_batch, - task_id=task_id, - files=saved_paths - ) + background_tasks.add_task(process_documents_batch, task_id=task_id, files=saved_paths) return UploadResponse( task_id=task_id, @@ -173,13 +157,10 @@ async def process_document( parse_options: dict ): """处理单个文档""" - from app.core.database import redis_db - try: - # 更新状态: 处理中 + # 状态: 解析中 await redis_db.set_task_status( - task_id, - status="processing", + task_id, status="processing", meta={"progress": 10, "message": "正在解析文档"} ) @@ -190,11 +171,10 @@ async def process_document( if not result.success: raise Exception(result.error or "解析失败") - # 更新状态: 存储数据 + # 状态: 存储中 await redis_db.set_task_status( - task_id, - status="processing", - meta={"progress": 40, "message": "正在存储数据"} + task_id, status="processing", + meta={"progress": 30, "message": "正在存储数据"} ) # 存储到 MongoDB @@ -209,24 +189,53 @@ async def process_document( structured_data=result.data.get("structured_data") ) - # 如果是 Excel,存储到 MySQL + # 如果是 Excel,存储到 MySQL + AI生成描述 + RAG索引 if doc_type in ["xlsx", "xls"]: - await store_excel_to_mysql(file_path, original_filename, result) + await redis_db.set_task_status( + task_id, status="processing", + meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"} + ) - # 更新状态: 建立 RAG 索引 + # 使用 TableRAG 服务完成建表和RAG索引 + rag_result = await table_rag_service.build_table_rag_index( + file_path=file_path, + filename=original_filename, + sheet_name=parse_options.get("sheet_name"), + header_row=parse_options.get("header_row", 0) + ) + + if rag_result.get("success"): + logger.info(f"RAG索引构建成功: {original_filename}") + else: + logger.warning(f"RAG索引构建失败: {rag_result.get('error')}") + + else: + # 非结构化文档 + await redis_db.set_task_status( + task_id, status="processing", + meta={"progress": 60, "message": "正在建立索引"} + ) + + # 如果文档中有表格数据,提取并存储到 MySQL + RAG + structured_data = result.data.get("structured_data", {}) + tables = structured_data.get("tables", []) + + if tables: + # 对每个表格建立 MySQL 表和 RAG 索引 + for table_info in tables: + await table_rag_service.index_document_table( + doc_id=doc_id, + filename=original_filename, + table_data=table_info, + source_doc_type=doc_type + ) + + # 同时对文档内容建立 RAG 索引 + await index_document_to_rag(doc_id, original_filename, result, doc_type) + + # 完成 await redis_db.set_task_status( - task_id, - status="processing", - meta={"progress": 70, "message": "正在建立索引"} - ) - - # 建立 RAG 索引 - await index_document_to_rag(doc_id, original_filename, result, doc_type) - - # 更新状态: 完成 - await redis_db.set_task_status( - task_id, - status="success", + task_id, status="success", meta={ "progress": 100, "message": "处理完成", @@ -244,20 +253,16 @@ async def process_document( except Exception as e: logger.error(f"文档处理失败: {str(e)}") await redis_db.set_task_status( - task_id, - status="failure", + task_id, status="failure", meta={"error": str(e)} ) async def process_documents_batch(task_id: str, files: List[dict]): """批量处理文档""" - from app.core.database import redis_db - try: await redis_db.set_task_status( - task_id, - status="processing", + task_id, status="processing", meta={"progress": 0, "message": "开始批量处理"} ) @@ -278,6 +283,29 @@ async def process_documents_batch(task_id: str, files: List[dict]): }, structured_data=result.data.get("structured_data") ) + + # Excel 处理 + if file_info["ext"] in ["xlsx", "xls"]: + await table_rag_service.build_table_rag_index( + file_path=file_info["path"], + filename=file_info["filename"] + ) + else: + # 非结构化文档:处理其中的表格 + 内容索引 + structured_data = result.data.get("structured_data", {}) + tables = structured_data.get("tables", []) + + if tables: + for table_info in tables: + await table_rag_service.index_document_table( + doc_id=doc_id, + filename=file_info["filename"], + table_data=table_info, + source_doc_type=file_info["ext"] + ) + + await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"]) + results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True}) else: results.append({"filename": file_info["filename"], "success": False, "error": result.error}) @@ -285,61 +313,38 @@ async def process_documents_batch(task_id: str, files: List[dict]): except Exception as e: results.append({"filename": file_info["filename"], "success": False, "error": str(e)}) - # 更新进度 progress = int((i + 1) / len(files) * 100) await redis_db.set_task_status( - task_id, - status="processing", + task_id, status="processing", meta={"progress": progress, "message": f"已处理 {i+1}/{len(files)}"} ) await redis_db.set_task_status( - task_id, - status="success", + task_id, status="success", meta={"progress": 100, "message": "批量处理完成", "results": results} ) except Exception as e: logger.error(f"批量处理失败: {str(e)}") await redis_db.set_task_status( - task_id, - status="failure", + task_id, status="failure", meta={"error": str(e)} ) -async def store_excel_to_mysql(file_path: str, filename: str, result: ParseResult): - """将 Excel 数据存储到 MySQL""" - # TODO: 实现 Excel 数据到 MySQL 的转换和存储 - # 需要根据表头动态创建表结构 - pass - - async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str): - """将文档索引到 RAG""" + """将非结构化文档索引到 RAG""" try: - if doc_type in ["xlsx", "xls"]: - # Excel 文件: 索引字段信息 - columns = result.metadata.get("columns", []) - for col in columns: - rag_service.index_field( - table_name=filename, - field_name=col, - field_description=f"Excel表格 {filename} 的列 {col}", - sample_values=None - ) - else: - # 其他文档: 索引文档内容 - content = result.data.get("content", "") - if content: - rag_service.index_document_content( - doc_id=doc_id, - content=content[:5000], # 限制长度 - metadata={ - "filename": filename, - "doc_type": doc_type - } - ) + content = result.data.get("content", "") + if content: + rag_service.index_document_content( + doc_id=doc_id, + content=content[:5000], + metadata={ + "filename": filename, + "doc_type": doc_type + } + ) except Exception as e: logger.warning(f"RAG 索引失败: {str(e)}") @@ -365,7 +370,3 @@ async def parse_uploaded_document( except Exception as e: logger.error(f"解析文档失败: {str(e)}") raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}") - - -# 需要添加 import -import logging diff --git a/backend/app/api/endpoints/templates.py b/backend/app/api/endpoints/templates.py index 2248b1c..572d56e 100644 --- a/backend/app/api/endpoints/templates.py +++ b/backend/app/api/endpoints/templates.py @@ -4,6 +4,7 @@ 提供模板上传、解析和填写功能 """ import io +import logging from typing import List, Optional from fastapi import APIRouter, File, HTTPException, Query, UploadFile @@ -222,7 +223,3 @@ async def export_filled_template( except Exception as e: logger.error(f"导出失败: {str(e)}") raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}") - - -# ==================== 需要添加的 import ==================== -import logging diff --git a/backend/app/config.py b/backend/app/config.py index dc80837..5e0e871 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -34,7 +34,7 @@ class Settings(BaseSettings): UPLOAD_DIR: str = "data/uploads" # ==================== RAG/向量数据库配置 ==================== - CHROMADB_PERSIST_DIR: str = "data/chromadb" + FAISS_INDEX_DIR: str = "data/faiss" # 允许 Pydantic 从 .env 文件读取 model_config = SettingsConfigDict( diff --git a/backend/app/core/database/__pycache__/__init__.cpython-312.pyc b/backend/app/core/database/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0c9f8627b2b5feeae8036e2563119733ab657da GIT binary patch literal 577 zcmZvaJxIeq7>4hXw3kK&aS>f~&C*ds#i0~&u>RET2pnnekZ7){Nu^{{5I287q@XA$ z2)c-~gDyHbxn^-tQ{DU-1SjvRML}$Wxnh;!L_8Y{1#ig8An(?w3QRNwR zg5LV}^6x3*Nrp7l2sH+>nMmhhFz*sth_s9yFjEmvUyKNGS2w%Wn9Z$%U7TYjY6-X{ zlUmKB79mc_sSuJbhadV7_vnJlhW!o!72f~^Jwl#U2;qyOLG9fDaOegM-h#ec(EqCG LEy@z|+lA*7Rp7Fm literal 0 HcmV?d00001 diff --git a/backend/app/core/database/__pycache__/mysql.cpython-312.pyc b/backend/app/core/database/__pycache__/mysql.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f1ebba1f18d01fce1d8f8389d10dbc15b937b3f GIT binary patch literal 9688 zcmcIqeRLF6mapoM>aXggzw$*eIfx9O@t$Nq>_-9i|c ziGWTJ#)O3-K~z9x2+LP=TxGyflRbORoU{L=O?wjSFlXFdx=Hq*jfOoij_2&&`>Lzb z30X$}m`du_tNZS~?^V_N-Fx4A{cl#QnSx_;!M_GyETpLa#2e+%XA0xhG(~k$9Ob1r zn$v`6FAZ01NE6n2wKVc{AzfJS)rT1`6E=7aVWZa=HhE2?P9HLdEnZ8Q^|E2BHz%Cy z%?;bUwy@o64?Db$u+!@#a!kk-&hzGl^S$}u0&fA~8A65OB5zT6hIaur!nFa{p2eL_xDMbtv$%7hKNS~bbCh>(tH$k$Z-!fQee0GVZ7NOwx;Oo+ z%c*NeM?QTk{o2`Em(Seld?np;CiP~Q*_`g^y!pw=Ti1`K-#U;wd|Z~j_4yRtEWcx|K2=`jWoDv$>|qcBY{9P661Hp!v2VVJ1@BPlC~z&D&^E* z-Q#?O7yPlPAn9s@ftaM*6cl5U@u`+rFdFfP+(wC67Y)V35y@EVkMWJcFfZxWN8({g zUl)q{W0GM57zeWwvn3`3BikiiBeW%10|F27K0dNN7~$Poi6zZ#;YAS)OAgX8pD5pk z{m+8|XGTS?$|!22+*&>m@;ya zYTO35T@+8jD5bm_j`nKd8)zjRT8$S#C47dOP+$iiZY@o{)0^&oEp_VJt@o}sxOI}z z=ZpBmyw4|@eZFv%i-)kx`g||M{h>^a!ROkAD}LEGyVsa>VRv>AwAJb}DicSI)p06<73E zpTopyCOwBTCa4$=a&o|PxRmn5mGs@LI5&6$>T|VHZc9VsC|VjtML>j(5A29a)|O~A zzaZr{Dr6>r#BNz$Be7@bb%vi5dIJ2Q%JO!kUx;gWq8?0-HElIE? zH|#}SB}YnzXOs-*7Y-K{-?lIhnA>$pW6^B~HD|@JJ#WZfp0JnSaLyWjpnSMs=5RjB z%po%6AT!gg+8Sc#B-lBpe{t0?$Uc^BxIi{MlPDE}((J{=a9P4$mN9A2&4$`9jAg{z z+ylb>(DCtC;YpvQXmB?CE zvb163ECE_o0JR@tL3R&lY68&BxfAx35~oot(^M+d+c9y_1tU#S$+Uh~fG1gpq~`@8Dge44 zMUy&6Um_CCi#Uo(OEZ`<5JYj?cIPUP$}pY)De)MP5$bLmHKt=sj^Tphp@Mmdf_Xy) zRf&SCp@K!pfDp9#=Fn{%sZS{a{^&Kr$xEKU)vW|`? z4?a0ixbh2j)tDZbcZY4O?~Y;hU8sluM8TrJ+-4|S(KnRFK!moYt5ekc9{I<}8qr=<^+c#5dE>6$gnJ6b6HW}!!??Q_x|is^Iduv};X zc3%bU(dhari?F;J_4?M(Q0}K_km=WuvX1fO8~V+($IA9w3|Mv;P^QSVj?(s*7kO4{ z`&a6r{4bOiEKEW%zSI41j=v8?M)42Fu!=8@f-kV422U`YIqv5N0CN-gHYpkt*LyIx zl#F(599?nh<=>8WU7C_WH^fVzYU=PNxpnHJ_hl8gULLjvE-1D{MP672D#Bw(YLKh| zlI_X$fe=XTwkl(e++q$ddVr7-_f?K#C}&n8XVy^8+(gdY!5sIH(LG>v4;$GbV`0Kr zIAknM7)z7J@*BpS4$DEyfNSX&#z$amndW~hqPN*sQd2;mrfc$aXC8r)#0Y#WE<_q8 zeK>TGT1X?t7pU$RUOJSg9*U#gOM`o+FX_O|(lnIr_c7EDsya;6ZJy?{DArXCbqEm0 z2o5o(4g_#>#iV65Bbw;zBs-IwCr$066bA<`wyNU$^aL%CaDwiaR`tn{(Zq|P-I3Qm zN_8AheY`Jy>8v_%Zhrjw$dxx6U{ovx5lB@?Vj|$T1(y~Lg)n0mHo&c0BReNUCj5(- z(J8*kvYn49Q=8pRWvpZ|p8;X<=Rjm1#ZbY53%J5wtUP?IcIb&JhJfULqlv?f-M`fP}Yh+-^eW_z6ZpI9OVf|`Jkh6h^XNLkDO+&NcO{+uoQPV_?;_Sroy`dDF&mAXjRSag?kP zsmXwnEGX%SZjbJMBY&@ckG_`bU;>)$R6z6hkb3TAvgQj+vzkLI4^2@@>qj()i*%s1 z7qnsiFscy8p( z;nZuVAX_LUs}N)z!K@0@2;0SK^$vB&*R0MG9=UQU)p2%`jQpvqOEaqy9s2-c6E-&! ze>TOeEJ{{5-bl4~K^~%7Y;YTdr=Zu8f#1ak;xS&x$2%P^YDmT=J{H*FhkQcE;R1%y z7l!#X%tb&6^9SK+BtJsZfLh>9#dru7I)vy>QPLvC@j>2(0$^*V9hlt1$laZ%KzI_H z2Q9H3hz#Nq%2hbztVlR3hMcpL&e=T$!^P#N)}B~qOP?^)($!=+iSmkc-f$H`FoW;OODlcJ5S9#G4l)8oZiZWbNL;E*0kad z1>|ca1J2oZhx6*c{SW8zF|4@@t-*i7Ir}?t2Xtm%o@Wu=MSE;aw-!n#7kG-beFoa& zVET-OST3FEaWQ?XG+184f(RnEh(W|c*+B!p-=#-dL?fq|@hmj-m(iX|w!hqfWw!zN zk``7dCSe)hBEj-c|AtteK=AiMuC`glKJ+egOo}z&jg*N8#3tQzQwxEeo*%IQf0}8> z55V#kK=5ifEk78=AHl<@0J#UnG9*jqZD`oCeskkPKYqHlrg8m4wd*&nZ(Q%50OGyi zhL0*k`#zwSt5V+!)Wp5WpjLa^;ARM%g)Q(5LL-t*KqM0^48#DBK@NlM&OkICi9skP zAl`~nDu@LVK+(Yy*vhl)Tm_wif2R+gV|T$+*k&qPK~FpgrRS`S9F@LvxpZK6iQNG64VQ&Wfu)`PBD2AA+`?M%32R2-|vE#_hVh z!)*E8F+Fm!k=q2l0pu=Jk-LySS>Y+veuBXLv;azd`4D57zJj?}2HY+rxXmZHg)-o_ z5O7NX2Z9BVd>o z@D-Ca650Q1;rj>}vjxo61-UXlO1|F_WRILAo0>-#jsTwPx7A*SO;Lxx5D%UZl zhwB(a*^9be-GS3fx_6u?N!aF%VKpQ>=A3q}b4j}0>(}!PrzJ~@_%7k_8juU8k<(uQ zEjΠS3jz7cyzG*EEfc>7rRlPrPPg*}_?}c&w_!s&=he^;V+8a5-G=MVqp`=gm>M z_AG90hU>`Ew-KJ5b7t`z8J;VP=WNY$=S$YwXdoWuBe6#Rwh$jj{HBlpGJWQq)XV$j zR2zbO>B7>?-o;1nkDPla{Z^+egL@jMuBG3&c=OXU>6flZxv-*6jQL<`9M;bp&ub)0 zix3R^g;wA5e5+)|mBJvbH1OEu=>I}%b#eSSxRTNNtMgikiARDE$R%SW9txom$$*JH zNXIcZOLu>A>)mT0dh1;8d0GeoD~>ZA&`)=CjC|S~gpDLh7Jc(<`Wklh=JgK+U{a(Z z$X4L7pkD@#Z}LNKA={Aqa2TrME^LNqrQSW4>g|}9y3{rD>OS}XVEdU>nXV5G zcq~tK0f-s_Nhh4*2_WbK)(yL6x7Gi7jzqYs*?6cp#io0(ncBT*jGH& z1HB*~=1fSzM68dS&`5j1d`U^QQH0D8`pB|+d#B+ROWlMLPkq4S+bolVhrl zb7k@5m}+kHi_dT7VV9VT&xeueOt3tDn*27hb}RDA-`&47>K zt1galVjb+A3$3!khFT#AECCm(2{>eS7Stp|3v5`!&Q>K*ByPX$8}Xd@c7>)bDn8?9D>%7 z7rVC|-Q5Fms~X~#)p?|@`{AP-drVi?B&{3U>JWRC1M{kr&Xt+iI579oq@y~6yq=Pz zYehEjNI>nL2RJ4Q_ryNzt9u@q5J(2xpTSLCbLIymq2M+$Esz*^jZh&^0LUBxaVt*+ zaC#t_VgBWpe@P+xMSOPxdVI+Mt4#R&5dlX4$yvyUA|FHEa?PJmu7VYwDzZBpjLP#6 z2Wp|imvXYiqC!}}Bun0y&8`6&ADw{41-{nofj{x!%N0k?56u82p0jA?0_{#v7>b>n7=&ix}b|4-Da zFQ`>_bS7GJhXOK=C1{E;e4QM0}>&1f$eGR;VsW^}I@G*z}Sw;2s> zYF~Ssg3BFqkfw1?NiMgyYOQp2XYFkYF1JfrxeS-vCCGOX{@0Zh>pb-6OY$#9{ts6b B{Vf0h literal 0 HcmV?d00001 diff --git a/backend/app/core/document_parser/__pycache__/__init__.cpython-312.pyc b/backend/app/core/document_parser/__pycache__/__init__.cpython-312.pyc index b60eb18aff1979a46a48c5bab634bae8446b4a42..25edcb68fe9e86057a81e00569f6bb69335328aa 100644 GIT binary patch literal 2873 zcmaJ@U2GIp6u$Gbv;Vu@(!~NTl#~hs+CVTy2*Jc(P((u2$htZicBW<9+1+|)R@!Ds zia~87&^ASue~Lh;A@L_B0z$%*PrkJ7Lt!EhM0N`=x=@V?Po8^cX1gWeO?&UT=bU@* z-0ytn-2F8e^dV@`CBGzZ2?+f`CQjpRVI~Jc9w{h>6h>h+CdI~BhRB@8rT7>RGOxHa zS4xNpDR<0G=Yr-*d1GEGyER|RAM;b$qXkmISdhwIEtCq!!VF^3>qznKLyG?*Yn>Xa z=|D>0YbF-y4T_=c3!wFt4u4TPbN23qv!zqRrSUVxlli7+n@WeiEA8J~Ja=sR!U1r& zbM>dv*tO!dq3L7$EKlD(vZKhD{5ILxMaEZ>3F96^l&tA{N!HbEGS*dW`suh`)w7xb ze!llLy~i?<*(;gE>@4(_jJuS~?sUcOeW%B;CJAz$)7-W&a|47tQc;WnAlY{Tz8J?M zmA_~^JR-ZQ^uiNlx1(3s6GF_RdR3q5Z)X?4j%q;RHjpu>@TyO7fu2`{eU;NmPq^82m*kuY>HL^IJu*pOr26`h7{%8J#E{_aC0NQfQ zMO*Hy_P4E_E?cM?9NhP?In+t^eWZ1IA5p+G$2zC%;c}e*oZ)wTp0G#IvCodqxs{!G zBkWieCdVE}3Y+7Pqm#Vj@hOwz+K|X)mw{h3{H4Kfia&f+96oUO;HBcw;R^QIh2VJS z@&PNYAc?ATYJF3-4$LHyH0y|&CfoWZ(Sr%Nrl4or+mk(}OUvw5F%A(O*P$9FuO)Tp zyVPFY^t~r*S@l(nGgx$)Ttd?^xrfQA5KzI2r|H$R-Q5{%sESTP&_vq9H#e=1zYST3 zGo5O}h`$CA*SD*(($=dRs+@}NloMTQT8Ycu-SI>Qt8rjXHl?NwnZp-1PEbf-heKTfHD>Og2HaIMBW3XCC5FE z&N-%?Ndg(qF$WnS29vFY#nV?$-?`jxaiDbYd~xWk$l^vYm@XO&k;nBgvw|da?Ur$e zjtNn8@q5ME3DNC zq7;snRRi2mU@$P;mftb5WAuaHJWJ{I6V)NETzV5Mn^V2e`w~TC=s%x37Ybn$%FN9Z2cvetUwu*I7^QE+$QYFdsNm44KWTEa$W;><}PO&`4ej)Df2e2@uYmo*K-{w&O9qw>(R!Q#nA z{5%*4b#=nIGRH6s+>+t{5fHQB7J6zLiMLQQ{cE{}mi{ZOV8r_fx(Dwu9zVO+O*j1y D1n%{Q delta 221 zcmdlf_M1ulG%qg~0}#xAye)G*kbVr}zyK4J@mXV{x++IHQxt0@n_Ma8K_AYMdHaRr1|#0*qW!~!JzG}&&k$H%ASC&$O%Vo3t32g}{!t^g{I zF97QWD+H-Z%*lz5U&-(pWbfn+EWVb%IBatBQ%ZAE?TUDSVjydaWq`y7W=2NFk4#LA bEMM6e7-jD=C_ms9pCNOBORbT;2;?CE2-rAV diff --git a/backend/app/core/document_parser/__pycache__/docx_parser.cpython-312.pyc b/backend/app/core/document_parser/__pycache__/docx_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c1d8b0cb342b3ff603900da6d19de30dcf37977 GIT binary patch literal 5713 zcmb7IeRLC57JrkONiz9pO9`PZG%bZ~idw{7_rpC0&;HYHLE@b5Sg)c?HHNi&e&yOEwKtHS++z4u5#j)XD5yvt#sFMQGX{}d3E6E z#Oo>L;GpFi7TCxVGPTJkwXu{;u8-`JsSN?YBvTIrL`gP3*e(U45nl)znKwlJu`nNz zuHo)kgKYVcPvkfG1d$hHtC~N|i?NX8$7=P@oFS&#fOr;M#3OJG@)$V6LncuRIBSpD zLvbWea};lEp*VUE0_BPp0`%u zc@26C&H$y1{}I}SSaRcTTJ^VV#a{Qet8M}p(_*Y0{AKof1CfB_^~w%lv_}Pr=e&HU z#79K%wu5UIr);B-c5!oa}gS=mAz9$gk z#fNzxw|SQ+@xE~L4xfJ~AK{vP?d{F}sK7UaG3p#<48mpIxVrXT!VFNmduGmcb-J0b z?vYpt$yZ%y+-%=x8a0<(FqfsP7xgzKsSD>^#H?eQhVmI2p`(6xwD!eGbGTN#A?wj*Anop+kn_3Bg_*O<#W6KB@<-ZU~<-5k9W)&2 cVt1YIIu`AYntINLr+qgG+QOmvaTuGQAWp((T1NSf>e; zXC3k1+viRkzhNul3KK;?E83jNJBSX{Np3|Q1X~pQE%axp%zRo1s>};YmTcX6p;fHU zmastfFs;Y2{PMYXufBIm`Td@o+SJtS?C@d9N@ef6m(RVT^gKQB@=#9e!|EygEya03 zyFP;puAF;u;`P3)>U&~1Bxt=}9)4aqaX{%CtgnQSWD;Y3KQD?h%?moj zYNGq8E3fX+WUib(r@a1LeWiepWe9&($+rU{Lm1VXYT)PnT_UQvP@61h;6gqR<~8U$_5C;3=Xn1lHg zF39FEFZnQ~`sBM)t-Nvi>X|p|D>t&G0*-L9k?#bsB??xxAjVo+0-Z7)igti!ao`m& zh*E8kDG7W4AVLxX?XnTVIz)HyO9XNL43NqU4ic z_9UfW{c*It~tAnkBn za#W8xs{6Ttz>&a3$I?j}mEJOGL37HJ_m9u5?)MJ_j|4xNyKKKXMWj}x%PUfr^qk7W z^?miJyV4EoKMcGd7^`SJ8yMpHsR8B)Gf;S>@M7h%;XtZ!q@ppWzwM%93Fw}?@C!3? zxYCY_OOA!3j)m!4-08}i@oM*A*-`JO4UNai!TW|Df9uB=-1X;u*BRRF{1VZ#W=|HP z^5v5*R9b$RImqvrAPgMypl~Z5XZ^tGaW{)qt6`W3IZP8KbV{ z*Jm3<(Jt?K<6=m<+*IqiQ^t`E@NacdfcS9K7*lP3w zVda^<%E_}?rFd`ODWH75chKK%n*V7x}h^NK&P(Tt^ z0f(iv^9)apun7{GfH;($AQ4ZBdgN%Q%`AK>`TUZ1C%-EL8JOOG%Ks#_6 zO_m^b@hQt&mZ4P_+^feN_l~+(_b(fM=))~%xBPBv>Y?5(`?tKjb;P|o$&5JeO&69N za2;4YQs^EryTAT|M6((R5noMSnzf#!ep3v|P4_7iRH=*T*Ff~507}61nlo@D2iIx* zuFz*pker!gLNL|ezaRj&*JbcM4L1=gn7w6z>dY56!hHq@bodgcxCxN;96;9Cif?^f z4oe&2^c_&!y%eyx0bkK$A44%aQ_9extEXPp7L|%Y_9)%2C?|R+PQH^{SJ!@-R8E~$ zQb{G%{dw0@S59^*FFbSQ#J-8XeadU^D<@x_=>I_ZphrU=@aB<|{GV22)|hF*QpUDd z_h?o@s_Y%kil~na%D{_C>iKQ|{NUC4?agA1yLodh8-sm{uDtZl)l-M^D!3N8$_E7>@bZ zp51@#S2y)ii|gx}xmtED8(R)dCVJnT=sTR#0e!DbY*WsjS+aOL(90GrPi&j$OXf-} zYmDJ*v^F%A)CQxHn!Q>B)jEx<+M$wKe=7RhDB!C%U_gVw%XC2E!vM8J-Y59mWV%&| z#@bnPZd=O6mH;2(MA-~51At9L6t;l4Y*E?jyNGPcRLOMMC;8h%43pGt3yHRKOZ9!3 z2iP>dyBAEed%x3aFK85ZLZa^AVn@;l5pZ^0+IdIXxh(Bmm3H2icHV#8ZgNtWOpVFw z1sLY_s_!?#dJt5xr0^h6G8q5>i>FZd2~O~S7ATxN28nFXyeVNg=Jm3KfZq^A9D~-W zE)?(^hME-OipL=NY7hF9a^0Xvnw_+4C+LFDW;!QnOn`GNv!)nKUjLcFPP2paAu!YI zuwTv2uN>L5`Qz17m^JBg(au5OLpH<>ulNHweEaFhzi?4?fZ8QnEb!2wY{Gyv5NTCs zye$;ik-^QuuSVU-C;fk^`182HP&G-{Fe

I65e}cjZM5A^@6STrZ zv=fjkJ&ImsyOO|dq=)RK+NoYuyQ){+uI|;eYs7YiN87HYDO%;x_3GR8y@qxJffR@# zx=Z9qQaNKhM@Y5yYz5kYX!T)4YZ(1GWui6bfWofz;hwt9F1Cm68}L?6zWc`HnX|V) zIy?FP(8NescL(#f*$IJs-o@{;lY(-U_kcjI?Q-)1+1kbNg66q?zN^pc^4!6**%g9r zwTok(cd;D93VJc$!f-wh@5V#Pk2X=b_g_$W1Ej$qkOFE~&_ug3h&n(nhc)datz;;g zWKe)fGpJ>n7itGl#C`(fkg3k;Kl$&M>*qzuSU_l6P4F0>f8j^R2 z6NragdX5kQ^h;2nbBVg74xNesaw=pVS_!qp<|1?e*~vD)vaXl*YwLOw`zvJ2C1=05 zL$IS1=j(^9(m0)6-Y(wh6!M{=zmMe^+R5za87~Lh%L#g^gVXEkWnh;{j`4H|TBp

C*GzW$(fV~gXt0q^r{UpM3C9qYS147Y`G(OVC2Jmczh>~*<&7%%N`_4hm6 zeJtak``o@>#>*!bB=tDg(;yq>O<#@yPqaj3H&jY=8`imwuE$+Wr1%p=d7UyJxGd87b=jmdenY!aH#AQ>RLvt@cJm)~!YqBZ!R z{5}NYb7ih>ydg85PDE>eO`O*y<);d$x(jA7LG=)oDoxdz&_wzI$$J4!K0@JCUq68OF)n$MA3;I_s2vFe^>> zjR8Y|3aA5`fHt5D=+oAbl#N8Avrj4MoKxCB*7;mn=eHsosz)3#pdio=M8Llx;%Lu) zK0J~?Wii}}9pJVH zjjtKECh`Gd07>h!=lMQ``E-?uNAla1vb{_=8(osVitXfDk~Q2ZX%Jg5^Y@oOy?yDz z#21IB@zm7x47hO5%87sc^!v;2P8|D($#){DQMZVebjHA~VFaDu9<*VV~zcDx=JxMZt$FzP)g18Eau+6R% zRN^?`+fzfpQY8u5wSwB+=LP@I3wqWy0KKG&VISY?k!vhIAM8Aw;FBR3#MgC7Z_8rC zC1i6v>vMy%1=p%;bQ%Rx#4$t%y8yQ;dTAD0q!}B+ zHejNz-{b1?I(huEO}K$zIPByTW<$a zJTKf6D{cusGo?Z0vrcas*%XIt=g7|S^15hwU95ZouJ0JxaS~bOMRFrnNdgry*{wzN;py3-9I`3r$lNRARkvGvQ4$Wf_0kPJnEqxr_dK|1|>T6oY8B z#v2CHvDPE4W2K8fw~e|k&mJ|86|cT-SR-%Z!ryYE^`E>N**8|WLfph3@&5mHih{lm zxJ95sSLYcwD$vy>xf`04|87`U4VAB&s0}OBU$tnUd`+R+}+i-gY~rw z2kj1SZgt&Udljnz!W~R~KxbYI8%#k3;UGkXxatD4Oo^V&u9mDG-gR~dPO!5Ps9qQE-p9U# z%R0#s;(0l|UkOVl8Zve{H25tu%vjQJO9uZn;Ngxz@;drXlN-;?3##z(H=o#feCOD# zrpUk*#aD)_hHnct2iM)mFB+y!>qqpVRVR(H{5ipO@nXyA(vi~83nvq{AXYp-_)Hwm z$L1rOffzSi6y~Ah3TqJDKH8d8PVA{9<^2Bg*;}DsOuTm@p_!&V zd-ha225RgMZ*^VG3OiIq)nvCr7j_#YfF?B_m$%ag=u056Pq(WiU1|rapj!LfwpFdI ztON7!!pUyjNi$RNFu)Pi;BY~ey{ro}(l}ut;qYOY=0e#O(Bii}O7qf?1=c*y2Z^M9 zh323h)Gx0(sE!vDVI2$=h37;XE-k*e_}hYIpm0THCk`Gz7^;8!wci)b4;KbE#`DZ4 zmKYWXq>=P)_NXrsQEI z!>=ML*-A*KD_MZZY)@r-GDy+{R-;sW6G&Rf9xxR{1lt4ZNa4Ueld2_E1m6%m*51NM z^}v%sjZ*3gJQcu$-(|*=;=y+xC1@==rgnqJa4U9!6&zFt)Q?-;amCSm5)EpAkD@7T zQy=xd?4RIBX0Ye98cRXyi)A54#Ozn#wX{4|cJToo&7;Pb`+%=H0BhF4h`PTiSA9VL zc)7BmZm04Ld)eW#@r8$>8^Jw4b&wsauO= z>>WtnZqV9ONj)XdAsy@t#1&YRp#=N)IGtt{_^tr^P&Y`%tr%1fq?PPCp8>W~GF`XN zgeOCvOLmKW5t<(m0m+U2FFX&jcRK<*9h_rcOJLVrN1#eFUZkMj2Qdu8auCe)4RC@M z(nMTmaQAt9yu#1dj@Umta zJTaKVu`7fpL>K!qEEvouAWq>oU8e^ylI$-+p5F4XPQ4+er9>h);O$* zS5}W#HbyHOBWtc#HjlnMzN{^}tnC_iec9GOl+B7STleMe&vsv16I=E?3_iB?#ptq^ zVz%v&Mz)34e)Pw~MN z7NhJONuEc&Q%glK@|Pd#k)@6J%Mbae>Scn%N_}-<(fUI2ch;iyrK)K#1%n9DgKV6E z$l)efc}np}Tn<1b&8k6=0FXvWOWq=RYGLfe186uCTuFkEN6^)>x1u%Ez{X;*C>a41 ztznFaM!-;=BI&t!gTSwlIOp0B>;=5|L5)!}J zv9o&TZpW@&bM1~@j%r-CF_!G?yhlQ190aiiPU-c-61XiSnOWYz|15WNNB$6E{Ef1Tb zxwX@9`qI(PZwuA~JTjPu79DwR++d9w0C|_oU_A{s0TilAKp`b6+(5tufsFQ-jShTU zut6H7Icg{yH`t;ETUy_=A`J6XYM6}#9HqYj+wTJgN+GI8Xq%G8=OE!-2XUmD$`_(WXUKkKb>epc_? zpViy+vwG+Ktls%_!8E-hGFf67&4cYK^i_k(OuTjZ&d3L;PZPH<{ATi#cmC(|qqpC> z1lJC?{_O((>A|V>ODJm@7XnXkH2V!p^~hd!|qo&>ivbX z5tB?`lBU}&U17it23*in9UfOF=g)D}IBKwUtije1%6>z7-LH2#f$9XJ)9KeH%YJJ2 zau~<2-Mw>O&D>p5(Lv93)ZwBYw>oznY^)uuVte4t{VInS*Zc~dE@3(0QrEYa-3dey zrNNE3c)dFxH|&6f!>1mMr398%L*-$xH2tRekGNO243=;VS#VU0=p7dv7=lKUMbe?E-sZUy zXw$ZPm^xKsrz-GLAPaLp;Y^gX_*(W{_6>L`L5;tUz&AeeMrNOA;;uqLkoQ!G$o?ZT{|VK`Q0;dp`xdg_)RqK`#b`0~ ONmLBA|A=rdj{kqrXz6PJ literal 0 HcmV?d00001 diff --git a/backend/app/core/document_parser/__pycache__/txt_parser.cpython-312.pyc b/backend/app/core/document_parser/__pycache__/txt_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f5c39baeb855ddd35f834a832243c2c0bf18ab3 GIT binary patch literal 8798 zcmcIKTXYjgmeo?X-nRUHNPYq~ma!eo8$%$x$%NNHG6_aRE7WacBg;;AD>gz(m<$Q> z;9#=^YzGr7A=zjG0cXP`CL}uvyP3>6`(sDspzV0ivL}{}&)Ksp0&|ij`?0sW)siqy zGV?KQR8_a?)~#E&y6&s_*OU|;g6DsWK5rT>MCcm~$PX23+&K=7UPPmML=&{aNz@Z! zt*BQ(t#m3|$a<2%W2BR6QPrzj)b;8XO}(Z?Tdx)S6;55fj;3gpQ{Q5!H?*YGrw~Yi z7^1mAUMx{F#`A;()u$=YCPb@`AX>v1&MRZR=?4`StsBqOO@4G~;`GspfeR%SHGBte zDY^Cbxrx_$BK<+#6vp0WAp~-Vo!@UE1?3voL4jP~)AFoWko_ z6oPK8onv;`S&m@^y;$#LIJc8`U{dlU$_s!yuR(DXBp41}R zNGlnNCK(k&)`%NhUfJOafjq8IT{OHU`^MK8m9aXtp)(`|%4n?lB*6#&If3Za9@qTJ>o z@!1GEC~R$_bm258&fNxL*4S)Kt|s1Q6SAPAt(D~&+QxM7jEjS#;RL-j!RE5JFmOU7 z$2c1Vtl#zIY+hiqpS-Rq4V&ls$*(kdcK`ps9N;kP01r^KiZr0_9%Lp)2 z;ire+ojelXG(LVBUP8VLKGI7zC(i+N{KzN4rSbB86>q7RID^hAE-25(PG%Q^`6_8n za@nj+hNQMk(~Fwrw>=5zS%Nj)w-)AX2Flad*CMZiO9z@MNyK8!qC7RuQ=nlE-@abss*W;rV#>gW?BB^DUr>fxu-v}M4F4HkH8v=q#ecg>D4Cn zs%1Wt`ZViNx6YwxL=MGXkeE;B)iswsNL^l;Bf7G@x)X}7ToPvZbTUO+KcMmHlinbM zSM61KH9x<;Od**kxzDSoQ$#JBI-sD918T4SLYn;E>_W3qE#ioF1%Y-W0{(i4u9Obt zZq!aVpbvA`4mkWQc{g(qyWVHu=g4sLT)CB~>2%&ALtcZ-2Y&12Hi43MN&Gl|;+8rT zyTRW149^*M#p-sV9Z8&8GTc#EPgfFqB;_rcKPbnqexQqwtzjFvx;TU<#8pvUiG+OX z!%LBWI&vS5tfK&>>uBWStCJs}o(%l@e?IOSAAVWtiTL|(9epSA+u+2x!xO(hcHivE z$^-3oylzS4#Sg}>o{n_CFmY-q!P8Dr?@ePX$;#HnjQ8Z#lM}BE#7S>-q@{Fs5!U8!El^FF)Z84y76tooW62tqU+a_uO5-+OulzD^4hVw5`kbX1dEN} zwtIw5K=+6ei$>63xnScWXyaBVrnk8!46K9sBKUtexd* zb=jDfHvXV3f&Dm(-y4g9#V?Z7RtsuJs|)NKFX&l&JIs~<3eV!$GK^S`3p_v)FP7Dy9Sc9*)IIT~KlE zhK8mNfpWICgZ!|Ng)|$@@L)h~@|z%#t!<1;AX%`bf*!0cV|UqjkQCVOHfIwrC_#FH znq}IYb_XMnq9jRXDpBlBVhwX;c3*(;PMz z-!RV~HP0W)51Ci=Xzr>|Mov#tAT!wURr<2wHQ}P-!G`{ZGy8k=-zibX@~8&ons4Nk zkLHvI4ux_S`Zocm*#*5@Pizg$31!XoZvYCW!oDX5ZT+_3<3kHVg-dP}t{N>|HOzgw zHdMGNl)u^k3!qY1I{0Y+qhYA_^zXS*STkB!6DpjK;obea&p^-oQ2-0(j1`pz%FZ;0 ziz;wy?iuG;#k?z2sOqsR)KKlG>0ji2miu}BJHH&JMynnRRcyJPuFXsJZ@!y@vWmlb z1viV!!?SC`g|ou5s>WuO1sczZ6?3j~yxbBjJJ%eZziedr_R;w}Zd1yNoZCiyPDV5j znP%NBG-4qeQ&PXHg>8?fFZ;i5D^c|p;`_T5C@ue6L}^pU3>n>9I=76N7hiS^m%iUH z^wdcHs_TY?=oQ zS^z#z%ig#&=kszBmzArb=Z}@t#zpEsK9qvnE47=o$}gzpn@Qyts|aZSiKL)(#C=|% zs)irFV|P68R`r69p6tWXgzO!F2cz&Re#(#YDQN}%lV>E`keR%!=auJU-lFIs&fv|_ zN}80ZVgHHE0#8iYtOstVJzh$t1%4B?3mqh<+s&>`k}U#TX^DfxHceF2*wB9R zhsjrhQ&;^d0Z@Vg$< zwW5|~YoWtqO6b_>R-_frdLN3zD4Lb2HIJodgSyMQo?ae=h^b(3N&k{F%X&0e+06x2 zmQ&dK^ogefn?u>v{*B?>!d}k_PoOf`7|LDZ-+UXCbY4_Jrss#V3VOGn*dEw?J*zrw zp53z@G+dVXyFwr`l3xA;sH6OO7akkQUG^;^Fm~R}OjA@1%^&XSM0KJ7PC%V(>)bX{ zxba`OKh^&8v!CuCF>k+a*fExA0-XT}_qfM^?-vYfRp@y7nlj~JEwnyM&sbB7J}aZv z)TlqJRO6mn<65Qi8X};5O+nzcQi>d9sHpEna1vbP&$=I+P z{Ju?IB|0MEm@8pGlh)=HuC-TUxuHQtuPEWl2#;w=8?jMxqPj+6JxcJ+N<8Md&AH?Rz z2|TC|@OgnxdwekR_A!aC1V=Ng0&xqGp5u}a3LA-2_b5_yERM+T5hj1T)sxSeSk-OqMDJCaj3y!KP*+>q3XeV70pGS0idFDIB4GS>LXo5sv;Pp9XY zcSK`mb6`2qpS$lW#LehoFs{9*G5$vr1C&pZFsjYT#urH82U1UQL+eu}LN7l40(#@d z@_g)yc@?x0E5)CoOeRq#XQWKZ6y+KyQlUShJbgbkp8E+ zfG|0}KY0}bby>)yFf(bdapNiD0!JE=BQ321jx@lL)(b&>j3e#eoFlEwiChCmddyKH zq782lJ{@qR_v+#t>Hp>&>19sj8aPV99C;O>no`}F@S5o4xFRxi_SU6S<5!PMT3~$m z-HGm=NcU+mU_F90&Z(h^moFx?gA|EMNQr2?x@78wi<1}Miwq1!&K{0auP%9F=awXB z@|EA;x^y1Lr+we4?(@KvC|Wq?X;Z^6Mb=UHgow`Waty}jBqzj}F%t!!TP z-g%EC^#EwC!LS!l_8wYL<+JW%e+BaQ6z=EwHg0v@Ll5nF+RCk5HS3ViYqdW8ch)`D zy`T%9#5nreJqxRs@3qp0mis(Ls1{Z&@>N56yAX`h$s~F)7aaEG<+mKyQt+G$46t^{224vrcuEsi zVtJieGG*Ym0n)*$FO~EHa`HR0YEWw1z#`6jWb_KG})Q9pa zfTJfnQ@&M^W}V*{)uGI6zxwOka!eo-dzu3a2J8Ck0*%4;ka_7>hGm$*+;LOU02f$# z#v3wK0fF6}Mlh9ry=Wr^onyb-bwEm`s9T?9Sf{w&j4uNvG}9j(IriF^Ts+fom3OwQzT zdgCc=2yRtBx)a!|SLuNanTznMeo{t6*GVr4RzmI8!zSapF*5Yh)Tuwj99aKLkd-J=a;Fo!Wp@Jm7^JR{ObU(Z$n`ANLIOj15Pw%zOt(?_oVepV`;c(_TYj31HrOT z(fl5LxOmRsq5ea`MWN#PJ%+J@;($JA4HeY&Xm4f~^m&56?0^2t%$jgPX`m*sJowD( z504bo_G}27iUXEW)4ZU4)bvo#x|@>P>npo%C<&KUTv!-vKfm(JvZX!i`sV)q*5O^i zLNPLCNoGZVg5#UQXJE;|-)M1O7rxQzY_^tG+6^ByZGypO zd&X^d#zxdO8+=l=*~Cw$Vorg@UbmQPX5rQ%F-gGUAcV#7E=%IV0fnH1Ghp%eLzcy0 zO6=pVgK;`-Hb~iLK`)2Z$Rko9gN+r73!GQtK7iu;Bk14AS$9b#RS``?v*+GcKS5Bb zH#2jh6mEdHGtGSqPgF<6e$dLg>^pjK#HA>t%)!}F1l7>&JGhFjGEoHqWfVabRNlc= zblD>Wl@;Wo2-idW@VZYkhd00X+#L~){z8Gw#o=7jZEYo$b2B|FN|sg2vvQ9QYVq+rq<-oy`eRYYE6C1LfScpRd-bzNg}65 L_h*D_af$y2>z~bO literal 0 HcmV?d00001 diff --git a/backend/app/services/excel_storage_service.py b/backend/app/services/excel_storage_service.py index 5f348e1..eb6d98a 100644 --- a/backend/app/services/excel_storage_service.py +++ b/backend/app/services/excel_storage_service.py @@ -246,6 +246,150 @@ class ExcelStorageService: logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}") return {"success": False, "error": str(e)} + async def store_structured_data( + self, + table_name: str, + data: Dict[str, Any], + source_doc_id: str = None + ) -> Dict[str, Any]: + """ + 将结构化数据(从非结构化文档提取的表格)存储到 MySQL + + Args: + table_name: 表名 + data: 结构化数据,格式为: + { + "columns": ["col1", "col2"], # 列名 + "rows": [["val1", "val2"], ["val3", "val4"]] # 数据行 + } + source_doc_id: 源文档 ID + + Returns: + 存储结果 + """ + results = { + "success": True, + "table_name": table_name, + "row_count": 0, + "columns": [] + } + + try: + columns = data.get("columns", []) + rows = data.get("rows", []) + + if not columns or not rows: + return {"success": False, "error": "数据为空"} + + # 清理列名 + sanitized_columns = [self._sanitize_column_name(c) for c in columns] + + # 推断列类型 + column_types = {} + for i, col in enumerate(columns): + col_values = [row[i] for row in rows if i < len(row)] + # 根据数据推断类型 + col_type = self._infer_type_from_values(col_values) + column_types[col] = col_type + results["columns"].append({ + "original_name": col, + "sanitized_name": self._sanitize_column_name(col), + "type": col_type + }) + + # 创建表 + model_class = self._create_table_model(table_name, columns, column_types) + + # 创建表结构 + async with self.mysql_db.get_session() as session: + model_class.__table__.create(session.bind, checkfirst=True) + + # 插入数据 + records = [] + for row in rows: + record = {} + for i, col in enumerate(columns): + if i >= len(row): + continue + col_name = self._sanitize_column_name(col) + value = row[i] + col_type = column_types.get(col, "TEXT") + + # 处理空值 + if value is None or str(value).strip() == '': + record[col_name] = None + elif col_type == "INTEGER": + try: + record[col_name] = int(value) + except (ValueError, TypeError): + record[col_name] = None + elif col_type == "FLOAT": + try: + record[col_name] = float(value) + except (ValueError, TypeError): + record[col_name] = None + else: + record[col_name] = str(value) + + records.append(record) + + # 批量插入 + async with self.mysql_db.get_session() as session: + for record in records: + session.add(model_class(**record)) + await session.commit() + + results["row_count"] = len(records) + logger.info(f"结构化数据已存储到 MySQL 表 {table_name},共 {len(records)} 行") + + return results + + except Exception as e: + logger.error(f"存储结构化数据到 MySQL 失败: {str(e)}") + return {"success": False, "error": str(e)} + + def _infer_type_from_values(self, values: List[Any]) -> str: + """ + 根据值列表推断列类型 + + Args: + values: 值列表 + + Returns: + 类型名称 + """ + non_null_values = [v for v in values if v is not None and str(v).strip() != ''] + if not non_null_values: + return "TEXT" + + # 检查是否全是整数 + is_integer = all(self._is_integer(v) for v in non_null_values) + if is_integer: + return "INTEGER" + + # 检查是否全是浮点数 + is_float = all(self._is_float(v) for v in non_null_values) + if is_float: + return "FLOAT" + + return "TEXT" + + def _is_integer(self, value: Any) -> bool: + """判断值是否可以转为整数""" + try: + int(value) + return True + except (ValueError, TypeError): + return False + + def _is_float(self, value: Any) -> bool: + """判断值是否可以转为浮点数""" + try: + float(value) + return True + except (ValueError, TypeError): + return False + async def query_table( self, table_name: str, diff --git a/backend/app/services/table_rag_service.py b/backend/app/services/table_rag_service.py new file mode 100644 index 0000000..4471e1d --- /dev/null +++ b/backend/app/services/table_rag_service.py @@ -0,0 +1,491 @@ +""" +表结构 RAG 索引服务 + +AI 自动生成表字段的语义描述,并建立向量索引 +""" +import logging +from typing import Any, Dict, List, Optional + +import pandas as pd + +from app.services.llm_service import llm_service +from app.services.rag_service import rag_service +from app.services.excel_storage_service import excel_storage_service +from app.core.database.mysql import mysql_db + +logger = logging.getLogger(__name__) + + +class TableRAGService: + """ + 表结构 RAG 索引服务 + + 核心功能: + 1. AI 根据表头和数据生成字段语义描述 + 2. 将字段描述存入向量数据库 (RAG) + 3. 支持自然语言查询表字段 + """ + + def __init__(self): + self.llm = llm_service + self.rag = rag_service + self.excel_storage = excel_storage_service + + async def generate_field_description( + self, + table_name: str, + field_name: str, + sample_values: List[Any], + all_fields: Dict[str, List[Any]] = None + ) -> str: + """ + 使用 AI 生成字段的语义描述 + + Args: + table_name: 表名 + field_name: 字段名 + sample_values: 字段示例值 (前10个) + all_fields: 其他字段的示例值,用于上下文理解 + + Returns: + 字段的语义描述 + """ + # 构建 Prompt + context = "" + if all_fields: + context = "\n其他字段示例:\n" + for fname, values in all_fields.items(): + if fname != field_name and values: + context += f"- {fname}: {', '.join([str(v) for v in values[:3]])}\n" + + prompt = f"""你是一个数据语义分析专家。请根据字段名和示例值,推断该字段的语义含义。 + +表名:{table_name} +字段名:{field_name} +示例值:{', '.join([str(v) for v in sample_values[:10] if v is not None])} +{context} + +请生成一段简洁的字段语义描述(不超过50字),说明: +1. 该字段代表什么含义 +2. 数据格式或单位(如果有) +3. 可能的业务用途 + +只输出描述文字,不要其他内容。""" + + try: + messages = [ + {"role": "system", "content": "你是一个专业的数据分析师。"}, + {"role": "user", "content": prompt} + ] + + response = await self.llm.chat( + messages=messages, + temperature=0.3, + max_tokens=200 + ) + + description = self.llm.extract_message_content(response) + return description.strip() + + except Exception as e: + logger.error(f"生成字段描述失败: {str(e)}") + return f"{field_name}: 数据字段" + + async def build_table_rag_index( + self, + file_path: str, + filename: str, + sheet_name: Optional[str] = None, + header_row: int = 0, + sample_size: int = 10 + ) -> Dict[str, Any]: + """ + 为 Excel 表构建完整的 RAG 索引 + + 流程: + 1. 读取 Excel 获取字段信息 + 2. AI 生成每个字段的语义描述 + 3. 将字段描述存入向量数据库 + + Args: + file_path: Excel 文件路径 + filename: 原始文件名 + sheet_name: 工作表名称 + header_row: 表头行号 + sample_size: 每个字段采样的数据条数 + + Returns: + 索引构建结果 + """ + results = { + "success": True, + "table_name": "", + "field_count": 0, + "indexed_fields": [], + "errors": [] + } + + try: + # 1. 读取 Excel + if sheet_name: + df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) + else: + df = pd.read_excel(file_path, header=header_row) + + if df.empty: + return {"success": False, "error": "Excel 文件为空"} + + # 清理列名 + df.columns = [str(c) for c in df.columns] + table_name = excel_storage._sanitize_table_name(filename) + results["table_name"] = table_name + results["field_count"] = len(df.columns) + + # 2. 初始化 RAG (如果需要) + if not self.rag._initialized: + self.rag._init_vector_store() + + # 3. 为每个字段生成描述并索引 + all_fields_data = {} + for col in df.columns: + # 采样示例值 + sample_values = df[col].dropna().head(sample_size).tolist() + all_fields_data[col] = sample_values + + # 批量生成描述(避免过多 API 调用) + indexed_count = 0 + for col in df.columns: + try: + sample_values = all_fields_data[col] + + # 生成描述 + description = await self.generate_field_description( + table_name=table_name, + field_name=col, + sample_values=sample_values, + all_fields=all_fields_data + ) + + # 存入 RAG + self.rag.index_field( + table_name=table_name, + field_name=col, + field_description=description, + sample_values=[str(v) for v in sample_values[:5]] + ) + + indexed_count += 1 + results["indexed_fields"].append({ + "field": col, + "description": description + }) + + logger.info(f"字段已索引: {table_name}.{col}") + + except Exception as e: + error_msg = f"字段 {col} 索引失败: {str(e)}" + logger.error(error_msg) + results["errors"].append(error_msg) + + # 4. 存储到 MySQL + store_result = await self.excel_storage.store_excel( + file_path=file_path, + filename=filename, + sheet_name=sheet_name, + header_row=header_row + ) + + if store_result.get("success"): + results["mysql_table"] = store_result.get("table_name") + results["row_count"] = store_result.get("row_count") + else: + results["mysql_warning"] = "MySQL 存储失败: " + str(store_result.get("error")) + + results["indexed_count"] = indexed_count + logger.info(f"表 {table_name} RAG 索引构建完成,共 {indexed_count} 个字段") + + return results + + except Exception as e: + logger.error(f"构建 RAG 索引失败: {str(e)}") + return {"success": False, "error": str(e)} + + async def index_document_table( + self, + doc_id: str, + filename: str, + table_data: Dict[str, Any], + source_doc_type: str + ) -> Dict[str, Any]: + """ + 为非结构化文档中提取的表格建立 MySQL 存储和 RAG 索引 + + Args: + doc_id: 源文档 ID + filename: 源文件名 + table_data: 表格数据,支持两种格式: + 1. docx/txt格式: {"rows": [["col1", "col2"], ["val1", "val2"]], ...} + 2. md格式: {"headers": [...], "rows": [...], ...} + source_doc_type: 源文档类型 (docx/md/txt) + + Returns: + 索引构建结果 + """ + results = { + "success": True, + "table_name": "", + "field_count": 0, + "indexed_fields": [], + "errors": [] + } + + try: + # 兼容两种格式 + if "headers" in table_data: + # md 格式:headers 和 rows 分开 + columns = table_data.get("headers", []) + data_rows = table_data.get("rows", []) + else: + # docx/txt 格式:第一行作为表头 + rows = table_data.get("rows", []) + if not rows or len(rows) < 2: + return {"success": False, "error": "表格数据不足"} + columns = rows[0] + data_rows = rows[1:] + + # 生成表名:源文件 + 表格索引 + base_name = self.excel_storage._sanitize_table_name(filename) + table_name = f"{base_name}_table{table_data.get('table_index', 0)}" + + results["table_name"] = table_name + results["field_count"] = len(columns) + + # 1. 初始化 RAG + if not self.rag._initialized: + self.rag._init_vector_store() + + # 2. 准备结构化数据 + structured_data = { + "columns": columns, + "rows": data_rows + } + + # 3. 存储到 MySQL + store_result = await self.excel_storage.store_structured_data( + table_name=table_name, + data=structured_data, + source_doc_id=doc_id + ) + + if store_result.get("success"): + results["mysql_table"] = store_result.get("table_name") + results["row_count"] = store_result.get("row_count") + else: + results["mysql_warning"] = "MySQL 存储失败: " + str(store_result.get("error")) + + # 4. 为每个字段生成描述并索引 + all_fields_data = {} + for i, col in enumerate(columns): + col_values = [row[i] for row in data_rows if i < len(row)] + all_fields_data[col] = col_values + + indexed_count = 0 + for col in columns: + try: + col_values = all_fields_data.get(col, []) + + # 生成描述 + description = await self.generate_field_description( + table_name=table_name, + field_name=col, + sample_values=col_values[:10], + all_fields=all_fields_data + ) + + # 存入 RAG + self.rag.index_field( + table_name=table_name, + field_name=col, + field_description=description, + sample_values=[str(v) for v in col_values[:5]] + ) + + indexed_count += 1 + results["indexed_fields"].append({ + "field": col, + "description": description + }) + + logger.info(f"文档表格字段已索引: {table_name}.{col}") + + except Exception as e: + error_msg = f"字段 {col} 索引失败: {str(e)}" + logger.error(error_msg) + results["errors"].append(error_msg) + + results["indexed_count"] = indexed_count + logger.info(f"文档表格 {table_name} RAG 索引构建完成,共 {indexed_count} 个字段") + + return results + + except Exception as e: + logger.error(f"构建文档表格 RAG 索引失败: {str(e)}") + return {"success": False, "error": str(e)} + + async def query_table_by_natural_language( + self, + user_query: str, + top_k: int = 5 + ) -> Dict[str, Any]: + """ + 根据自然语言查询相关表字段 + + Args: + user_query: 用户查询 + top_k: 返回数量 + + Returns: + 匹配的字段信息 + """ + try: + # 1. RAG 检索 + rag_results = self.rag.retrieve(user_query, top_k=top_k) + + # 2. 解析检索结果 + matched_fields = [] + for result in rag_results: + metadata = result.get("metadata", {}) + matched_fields.append({ + "table_name": metadata.get("table_name", ""), + "field_name": metadata.get("field_name", ""), + "description": result.get("content", ""), + "score": result.get("score", 0), + "sample_values": [] # 可以后续补充 + }) + + return { + "success": True, + "query": user_query, + "matched_fields": matched_fields, + "count": len(matched_fields) + } + + except Exception as e: + logger.error(f"查询失败: {str(e)}") + return {"success": False, "error": str(e)} + + async def get_table_fields_with_description( + self, + table_name: str + ) -> List[Dict[str, Any]]: + """ + 获取表的字段及其描述 + + Args: + table_name: 表名 + + Returns: + 字段列表 + """ + try: + # 从 RAG 检索该表的所有字段 + results = self.rag.retrieve_by_table(table_name, top_k=50) + + fields = [] + for result in results: + metadata = result.get("metadata", {}) + fields.append({ + "table_name": metadata.get("table_name", ""), + "field_name": metadata.get("field_name", ""), + "description": result.get("content", ""), + "score": result.get("score", 0) + }) + + return fields + + except Exception as e: + logger.error(f"获取字段失败: {str(e)}") + return [] + + async def rebuild_all_table_indexes(self) -> Dict[str, Any]: + """ + 重建所有表的 RAG 索引 + + 从 MySQL 读取所有表结构,重新生成描述并索引 + """ + try: + # 清空现有索引 + self.rag.clear() + + # 获取所有表 + tables = await self.excel_storage.list_tables() + + results = { + "success": True, + "tables_processed": 0, + "total_fields": 0, + "errors": [] + } + + for table_name in tables: + try: + # 获取表结构 + schema = await self.excel_storage.get_table_schema(table_name) + + if not schema: + continue + + # 初始化 RAG + if not self.rag._initialized: + self.rag._init_vector_store() + + # 为每个字段生成描述并索引 + for col_info in schema: + field_name = col_info.get("COLUMN_NAME", "") + if field_name in ["id", "created_at", "updated_at"]: + continue + + # 采样数据 + samples = await self.excel_storage.query_table( + table_name, + columns=[field_name], + limit=10 + ) + sample_values = [r.get(field_name) for r in samples if r.get(field_name)] + + # 生成描述 + description = await self.generate_field_description( + table_name=table_name, + field_name=field_name, + sample_values=sample_values + ) + + # 索引 + self.rag.index_field( + table_name=table_name, + field_name=field_name, + field_description=description, + sample_values=[str(v) for v in sample_values[:5]] + ) + + results["total_fields"] += 1 + + results["tables_processed"] += 1 + logger.info(f"表 {table_name} 索引重建完成") + + except Exception as e: + error_msg = f"表 {table_name} 索引失败: {str(e)}" + logger.error(error_msg) + results["errors"].append(error_msg) + + logger.info(f"全部 {results['tables_processed']} 个表索引重建完成") + return results + + except Exception as e: + logger.error(f"重建索引失败: {str(e)}") + return {"success": False, "error": str(e)} + + +# ==================== 全局单例 ==================== + +table_rag_service = TableRAGService()