# Polling worker: fetches document chunks that have no vector yet,
# vectorizes their text, and writes the resulting vector id back.
import time
|
|
|
|
import proto.documents_pb2_grpc
|
|
import proto.documents_pb2
|
|
import init
|
|
import doc_client
|
|
import sys
|
|
import signal
|
|
|
|
from threading import Thread
|
|
|
|
# Module-level list intended to hold worker Thread objects. No live code in
# the visible part of this file appends to or reads from it.
threads = []
|
|
|
|
def sync_documents(poll_interval_seconds=5):
    """Poll for document chunks without a vector and process each one, forever.

    Every cycle requests the chunks returned by ``GetNoVectorDocumentChunks``,
    passes each chunk to ``vector_and_save`` one at a time, prints a progress
    message, and then sleeps before polling again. The loop has no exit
    condition, and nothing is caught here: an exception raised by the RPC or
    by ``vector_and_save`` propagates to the caller and ends the loop.

    Args:
        poll_interval_seconds: Seconds to sleep between polling cycles.
            Defaults to 5, the interval that used to be hard-coded.
    """
    while True:
        request = proto.documents_pb2.GetNotVectorDocumentChunksRequest()
        chunks = doc_client.stub.GetNoVectorDocumentChunks(request).chunks

        # NOTE: chunks are processed sequentially. An earlier draft handed
        # each chunk to its own worker thread (capped at 10 threads); that
        # draft was disabled and has been removed from this function.
        for chunk in chunks:
            vector_and_save(chunk)

        print("进入下一次循环...")
        time.sleep(poll_interval_seconds)
|
|
|
|
|
|
def vector_and_save(chunk):
    """Vectorize one chunk's text, store the vector, and record its id.

    The chunk's ``content`` is passed to ``init.text_to_vector``; the result
    is stored through ``init.insert_document`` together with the chunk and
    document identifiers. Whatever ``insert_document`` returns is then sent
    as ``vector_id`` in an ``UpdateDocumentChunk`` call for this chunk.
    Progress and both responses are printed to stdout.

    Args:
        chunk: Object exposing ``content``, ``id`` and ``document`` (which in
            turn exposes ``id``, ``library_id`` and ``user_id``).
    """
    text = chunk.content

    print("正在进行文本向量化...")
    embedding = init.text_to_vector(text)

    # Store the vector alongside the identifiers that locate this chunk.
    document = chunk.document
    vector_id = init.insert_document(
        document_id=document.id,
        document_chunk_id=chunk.id,
        library_id=document.library_id,
        user_id=document.user_id,
        vector=embedding,
    )
    print(vector_id)

    # Write the value returned by the insert back onto the chunk record.
    request = proto.documents_pb2.UpdateChunkedDocumentRequest(
        id=chunk.id,
        vector_id=vector_id,
    )
    reply = doc_client.stub.UpdateDocumentChunk(request)

    print(reply)
    print("向量化完成。")
|
|
|
|
|
|
# Script entry point: start the endless sync loop only when this file is run
# directly, not when it is imported.
if __name__ == '__main__':
    sync_documents()