# pip install langchain chromadb openai unstructured langchain-community langchain-openai langchain-chroma
|
||
|
|
import os
|
||
|
|
import openai
|
||
|
|
import chromadb
|
||
|
|
from langchain_community.document_loaders import DirectoryLoader
|
||
|
|
from langchain.text_splitter import CharacterTextSplitter
|
||
|
|
from langchain_openai import OpenAIEmbeddings
|
||
|
|
from langchain_chroma import Chroma
|
||
|
|
|
||
|
|
# OpenAI credentials.
# Paste your key into OPENAI_API_KEY below, or leave it empty and run the
# script with the OPENAI_API_KEY environment variable set instead.
# NOTE: avoid committing a real key to version control.
OPENAI_API_KEY = ""

# An explicitly pasted key wins; otherwise fall back to the environment.
# Doing the fallback here means the documented "env variable" path does not
# depend on how downstream libraries treat an empty-string key.
openai.api_key = OPENAI_API_KEY or os.environ.get("OPENAI_API_KEY", "")
|
||
|
|
|
||
|
|
# Load documents.
# The corpus lives in the "wiki_docs" folder next to this script; the path is
# built from this file's own location rather than the current working
# directory, so the script can be launched from anywhere.
current_file_path = os.path.abspath(__file__)
current_directory_path = os.path.dirname(current_file_path)
wiki_docs_path = os.path.join(current_directory_path, "./wiki_docs")

# Only files matching *.txt in that folder are picked up.
loader = DirectoryLoader(
    wiki_docs_path,
    glob="*.txt",
)
documents = loader.load()
|
||
|
|
|
||
|
|
# Split documents.
# The splitter is configured with chunk_size=500 and chunk_overlap=50; the
# resulting chunks (not the whole documents) are what gets embedded later.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)
|
||
|
|
|
||
|
|
# Set up OpenAI embeddings.
# The embedding client is given whatever key is currently stored on
# openai.api_key.
embedding_function = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
)
|
||
|
|
|
||
|
|
# Set up the Chroma database client.
# Connection details for the Chroma server, reached over HTTP; adjust these
# if the server is not running locally on port 8000.
host = "localhost"
port = "8000"
chroma_client = chromadb.HttpClient(host=host, port=port)
|
||
|
|
|
||
|
|
# Load the chunks into the database.
# Chroma.from_documents is given the chunks, the embedding function and the
# HTTP client, and stores the result in the named collection.
collection_name = "china_history"
print("Loading documents with embeddings into database...")
db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    client=chroma_client,
    collection_name=collection_name,
)
print("Done")
|
||
|
|
|
||
|
|
# RAG retrieval.
# Ask the vector store for the chunks most relevant to a sample question and
# print them.
retriever = db.as_retriever()

# Use invoke(): it is the supported retriever entry point, whereas
# get_relevant_documents() is deprecated in langchain-core.
docs = retriever.invoke("Who is Wu Zetian?")
print(docs)
|