from os import environ  # kept: typically used for OPENAI_API_KEY in this kind of script

# Milvus connection settings (local standalone instance).
MILVUS_HOST = "127.0.0.1"
MILVUS_PORT = "19530"

import random

from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Milvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

# Open the "default" connection that every pymilvus call below uses implicitly.
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

# Create the "book" collection on first run only.
if not utility.has_collection("book"):
    book_id = FieldSchema(
        name="book_id",
        dtype=DataType.INT64,
        # NOTE(review): auto_id is off and the demo data below always inserts
        # ids 0..1999, so re-running this script inserts duplicate primary keys.
        is_primary=True,
    )
    book_name = FieldSchema(
        name="book_name",
        dtype=DataType.VARCHAR,
        max_length=200,
        # Used when this field is left empty during inserts or upserts;
        # the default's type must match `dtype`.
        default_value="Unknown",
    )
    word_count = FieldSchema(
        name="word_count",
        dtype=DataType.INT64,
        # Used when this field is left empty during inserts or upserts.
        default_value=9999,
    )
    book_intro = FieldSchema(
        name="book_intro",
        dtype=DataType.FLOAT_VECTOR,
        dim=2,  # toy dimensionality, matching the 2-element vectors built below
    )
    schema = CollectionSchema(
        fields=[book_id, book_name, word_count, book_intro],
        description="Test book search",
        enable_dynamic_field=True,
    )
    collection_name = "book"
    print("Create collection...")
    collection = Collection(
        name=collection_name,
        schema=schema,
        using="default",
        shards_num=2,
    )

# 2000 demo rows, column-oriented as pymilvus expects:
# ids 0..1999, names "0".."1999", word counts 10000..11999, random 2-d vectors.
data = [
    list(range(2000)),
    [str(i) for i in range(2000)],
    list(range(10000, 12000)),
    [[random.random() for _ in range(2)] for _ in range(2000)],
]

collection = Collection("book")  # Get an existing collection.
import sys

# Loading is not required for inserts, only for search/query, so no
# collection.load() here.

# Insert the demo rows, then flush so the segment is sealed and the rows
# are durably persisted (insert alone buffers in the growing segment).
mr = collection.insert(data)
collection.flush()

# Debug stop: everything below (web ingestion + vector store + QA chain) is
# intentionally skipped while the insert path above is being tested.
# Remove this line to run the full pipeline.
sys.exit(0)

print("读取文档")
loader = WebBaseLoader([
    "https://ivampiresp.com/2022/10/25/nginx-dynamic-reverse-proxy-expose-intranet-http-service",
])
print("加载文档")
docs = loader.load()

# Split the fetched pages into ~1KB chunks with no overlap.
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs = text_splitter.split_documents(docs)

print("转换为向量")
# Embed the chunks with OpenAI's ada-002 model.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Embed the chunks and store them in Milvus in one call.
# SECURITY NOTE(review): a commented-out variant here previously contained a
# hard-coded Zilliz Cloud API token; it was removed — rotate that credential
# and pass secrets via environment variables instead.
vector_db = Milvus.from_documents(docs, embedding=embeddings, connection_args={
    "host": MILVUS_HOST,
    "port": MILVUS_PORT,
})
print("存储完成")

print("提出问题")
# map_reduce QA chain over the ingested chunks, keeping intermediate steps
# for inspection; temperature=0 for deterministic answers.
chain = load_qa_with_sources_chain(
    OpenAI(temperature=0),
    chain_type="map_reduce",
    return_intermediate_steps=True,
)
query = "首页是什么"
output = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
print(output)