# langchain-chat-with-milvus/milvus.py

from os import environ

# Milvus endpoint. The defaults target a local instance; set the MILVUS_HOST /
# MILVUS_PORT environment variables to point the script at another server.
MILVUS_HOST = environ.get("MILVUS_HOST", "127.0.0.1")
MILVUS_PORT = environ.get("MILVUS_PORT", "19530")

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Milvus
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

import random
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

# Register the "default" connection alias for the configured Milvus server.
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)

# Create the "book" collection only when the server does not already have it.
if not utility.has_collection("book"):
    collection_name = "book"

    # INT64 primary key.
    book_id = FieldSchema("book_id", DataType.INT64, is_primary=True)

    # A default value is used when the field is left empty during data inserts
    # or upserts; its type must be the same as the field's `dtype`.
    book_name = FieldSchema(
        "book_name", DataType.VARCHAR, max_length=200, default_value="Unknown"
    )
    word_count = FieldSchema("word_count", DataType.INT64, default_value=9999)

    # Two-dimensional float vector field.
    book_intro = FieldSchema("book_intro", DataType.FLOAT_VECTOR, dim=2)

    schema = CollectionSchema(
        [book_id, book_name, word_count, book_intro],
        description="Test book search",
        enable_dynamic_field=True,
    )

    print("Create collection...")
    collection = Collection(
        collection_name, schema=schema, using="default", shards_num=2
    )

# Column-oriented sample rows: one inner list per schema field, in the order
# book_id, book_name, word_count, book_intro.
NUM_BOOKS = 2000
VECTOR_DIM = 2  # must match the `dim` declared for the book_intro field
data = [
    list(range(NUM_BOOKS)),
    [str(i) for i in range(NUM_BOOKS)],
    list(range(10000, 10000 + NUM_BOOKS)),
    [[random.random() for _ in range(VECTOR_DIM)] for _ in range(NUM_BOOKS)],
]
# Open a handle to the existing "book" collection.
collection = Collection("book")

# Loading the collection is currently disabled.
# if not collection.is_loaded:
#     collection.load()

# Insert the column-oriented sample data; `mr` holds the returned result.
mr = collection.insert(data)

# Stop here: everything below this line is currently unreachable.
# `raise SystemExit` is used instead of the builtin `exit()`, which is injected
# by the `site` module for interactive use and is missing under `python -S`.
raise SystemExit(0)

print("读取文档")

# Web pages to fetch.
source_urls = [
    "https://ivampiresp.com/2022/10/25/nginx-dynamic-reverse-proxy-expose-intranet-http-service",
]
loader = WebBaseLoader(source_urls)

print("加载文档")
raw_docs = loader.load()

# Split the loaded pages into chunks (chunk_size=1024, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs = text_splitter.split_documents(raw_docs)

print("转换为向量")
# Embedding client used to convert the document chunks into vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Disabled: open the existing Milvus store and run a similarity search
# (by URL) to de-duplicate before storing.
# vector_db = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
# )
# docs = vector_db.similarity_search(query=docs, k=1)

print("存储向量")
# Connection settings for the Milvus server the vectors are written to.
milvus_connection = {"host": MILVUS_HOST, "port": MILVUS_PORT}
vector_db = Milvus.from_documents(
    docs,
    embedding=embeddings,
    connection_args=milvus_connection,
)
print("存储完成")

# Zilliz Cloud variant (disabled). Never commit a real token; load it from the
# environment instead.
# vector_db = Milvus.from_documents(docs, embedding=embeddings, connection_args={
#     "uri": "https://in03-d25b13fd0ed7426.api.gcp-us-west1.zillizcloud.com",
#     "token": "<REDACTED_ZILLIZ_CLOUD_TOKEN>"
# })

#
# print("执行查询")
# query = ""
#
# print("相似度搜索")
# docs = vector_db.similarity_search(query)
#
# print("内容")
# content = docs[0].page_content
# print(content)

print("提出问题")
# Build a question-answering-with-sources chain over an OpenAI LLM
# (temperature 0), using the "map_reduce" chain type.
llm = OpenAI(temperature=0)
chain = load_qa_with_sources_chain(
    llm,
    chain_type="map_reduce",
    return_intermediate_steps=True,
)
query = "首页是什么"

# Run the chain over the split documents and print its outputs.
chain_inputs = {"input_documents": docs, "question": query}
output = chain(chain_inputs, return_only_outputs=True)
print(output)