diff --git a/document_ai/server.py b/document_ai/server.py
index 4ed68d7..0ccf6f4 100644
--- a/document_ai/server.py
+++ b/document_ai/server.py
@@ -1,5 +1,9 @@
 import os
 from concurrent import futures
+
+import langchain
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
 import document_query_pb2
 import document_query_pb2_grpc
 import grpc
@@ -11,6 +15,10 @@ from langchain.schema.document import Document
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+from langchain.cache import InMemoryCache
+
+langchain.llm_cache = InMemoryCache()
+
 
 
 class AIServer(document_query_pb2_grpc.DocumentQuery):
     def Query(self, request, context):
@@ -50,17 +58,27 @@ class AIServer(document_query_pb2_grpc.DocumentQuery):
             except Exception as e:
                 print(e)
 
-        print(real_document)
+        # print(real_document)
+
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
+        all_splits = text_splitter.split_documents(real_document)
+
+        print("real_document: ", all_splits)
+
+        # document length
+        # print("文档长度: ", len(all_splits))
 
         print("正在调用 LLM: " + question + "...")
-        chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="map_reduce",
+
+        chain = load_qa_with_sources_chain(OpenAI(temperature=0, max_tokens=-1), chain_type="map_reduce",  # max_tokens=-1: let LangChain size the completion to the model's 4097-token context (4097 here would always overflow)
                                            return_intermediate_steps=False, verbose=False)
-        output = chain({"input_documents": real_document, "question": question}, return_only_outputs=False)
+        output = chain({"input_documents": all_splits, "question": question}, return_only_outputs=False)
 
         print("回复:" + output["output_text"])
 
         return document_query_pb2.QueryResponse(
             text=output["output_text"]
+            # text = "test"
         )
 
 
@@ -75,5 +93,3 @@ def serve():
     server.add_insecure_port(_ADDR)
     server.start()
     server.wait_for_termination()
-
-
diff --git a/document_ai/worker.py b/document_ai/worker.py
index 1ceceb6..a4fcc75 100644
--- a/document_ai/worker.py
+++ b/document_ai/worker.py
@@ -32,4 +32,4 @@ def sync_documents():
         print(update_vector_id_response)
         print("更新向量完成")
-        time.sleep(1 * 60)
+        time.sleep(1 * 5)  # NOTE(review): poll interval cut from 60s to 5s — confirm this isn't a debug leftover
 
diff --git a/import_from_wordpress.py b/import_from_wordpress.py
new file mode 100644
index 0000000..90c9864
--- /dev/null
+++ b/import_from_wordpress.py
@@ -0,0 +1,40 @@
+import html2text
+import requests
+
+wordpress_url = "https://ivampiresp.com"
+
+api_url = wordpress_url + "/wp-json/wp/v2/posts"
+
+leaf_api_url = "http://localhost:8080/api/documents"
+
+jwt = "eyJpc3MiOiJvYXV0aCIsImlhdCI6MTY5OTk2OTQyOCwiZXhwIjoxNzAwNDAxNDI4LCJuYmYiOjE2OTk5Njk0MjgsImp0aSI6IjJXZUJtQWdVZGtPRUQ2am0iLCJzdWIiOiIyIiwicHJ2IjoiMjNiZDVjODk0OWY2MDBhZGIzOWU3MDFjNDAwODcyZGI3YTU5NzZmNyIsInRlYW1faWQiOm51bGwsInVzZXIiOnsiaWQiOjIsInV1aWQiOiI0NmY0ODNkYi03Y2M3LTQwYmUtOTljZC04NTY4NThiYTg3YmUiLCJuYW1lIjoiMSIsImVtYWlsIjoiaW1AaXZhbXBpcmVzcC5jb20iLCJlbWFpbF92ZXJpZmllZF9hdCI6IjIwMjMtMTEtMDVUMTE6Mzc6NTcuMDAwMDAwWiIsInJlYWxfbmFtZV92ZXJpZmllZF9hdCI6bnVsbH0sImFwcCI6InRvZG8ifQ"  # SECURITY(review): hard-coded JWT committed to the repo — revoke this token and load it from an env var instead
+
+# Fetch every post from the WordPress REST API
+res = requests.get(api_url)
+res_json = res.json()
+
+for i in range(len(res_json)):
+    title = res_json[i]["title"]["rendered"]
+
+    post_id = res_json[i]["id"]
+    url = res_json[i]["link"]
+
+    text = html2text.HTML2Text().handle(res_json[i]["content"]["rendered"])
+    content = f"""
+文章ID: {post_id}
+链接: {url}
+---
+{text}
+    """
+
+    result = requests.post(leaf_api_url, json={
+        "Title": title,
+        "Content": content,
+        "LibraryId": 9
+    }, headers={
+        # "Authorization": f"Bearer {jwt}"
+        "X-Jwt-Payload": f"{jwt}"
+    })
+
+    print(result.json())
+