langchain-chat-with-milvus/document_ai/init.py

83 lines
2.0 KiB
Python
Raw Normal View History

2023-11-15 08:20:30 +00:00
import os
from langchain.embeddings import OpenAIEmbeddings
from pymilvus import (
connections,
utility,
FieldSchema,
CollectionSchema,
DataType,
Collection,
)
# Connection settings: an unset or empty environment variable falls back
# to the local default.
MILVUS_HOST = os.environ.get("MILVUS_HOST") or "127.0.0.1"
MILVUS_PORT = os.environ.get("MILVUS_PORT") or "19530"

# Register the connection under the "default" alias.
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
# Name of the Milvus collection holding the embedded document chunks.
# Defined before the existence check so the check and the creation call
# cannot drift apart.
collection_name = "leaf_documents"

# One-time bootstrap: create the collection and its indexes only when the
# server does not already have it.
if not utility.has_collection(collection_name):
    _document_id = FieldSchema(
        name="document_id",
        dtype=DataType.INT64,
    )
    # The chunk id (not the document id) is the primary key of the collection.
    _document_chunk_id = FieldSchema(
        name="document_chunk_id",
        dtype=DataType.INT64,
        is_primary=True,
    )
    _library_id = FieldSchema(
        name="library_id",
        dtype=DataType.INT64,
    )
    _user_id = FieldSchema(
        name="user_id",
        dtype=DataType.INT64,
    )
    # `dim` must equal the length of the vectors that get inserted (1536 here).
    _document_vector = FieldSchema(
        name="vector",
        dtype=DataType.FLOAT_VECTOR,
        dim=1536,
    )

    # Field order here is the column order expected by column-based inserts.
    schema = CollectionSchema(
        fields=[_document_id, _document_chunk_id, _library_id, _user_id, _document_vector],
        enable_dynamic_field=True,
    )

    print("Create collection...")
    collection = Collection(
        name=collection_name,
        schema=schema,
        using='default',
        shards_num=2,
    )

    # Vector index: HNSW with L2 distance.
    # NOTE(review): M / efConstruction are passed at the top level of
    # index_params rather than nested under a "params" key -- confirm the
    # deployed pymilvus/Milvus version accepts this flat form.
    collection.create_index(
        field_name="vector",
        index_params={"metric_type": "L2", "M": 8, "efConstruction": 64, "index_type": "HNSW"},
    )
    # Scalar index on user_id.
    collection.create_index(
        field_name="user_id",
        index_name="idx_user_id",
    )
# Bind a handle to the "leaf_documents" collection and load it.
collection = Collection(name="leaf_documents")
collection.load()

# Embedding client used by text_to_vector().
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
def text_to_vector(text: str):
    """Embed ``text`` with the module-level ``embeddings`` client.

    Returns whatever ``embeddings.embed_query`` produces for ``text``.
    """
    vector = embeddings.embed_query(text)
    return vector
2023-11-18 15:08:22 +00:00
def insert_document(document_id: int, document_chunk_id: int, library_id: int, user_id: int, vector: list):
    """Insert one embedded document chunk into the module-level collection.

    The values are sent as a column-based insert: one single-element column
    per field, in the order document_id, document_chunk_id, library_id,
    user_id, vector.

    Args:
        document_id: Id of the document the chunk belongs to.
        document_chunk_id: Id of the chunk being inserted.
        library_id: Id of the library associated with the chunk.
        user_id: Id of the user associated with the chunk.
        vector: Embedding vector for the chunk.

    Returns:
        The first primary key reported by the insert result.
    """
    # Each inner list is one column holding exactly one row's value.
    columns = [
        [document_id],
        [document_chunk_id],
        [library_id],
        [user_id],
        [vector],
    ]
    result = collection.insert(data=columns)
    return result.primary_keys[0]