Here is a fully functioning script that will allow you to chat with your local files.
Feel free to use it for your own AI projects!
Check out other tutorials: LLMs — Resources & Tutorials
Code base: Python
Storage: FAISS vector DB
Framework: Langchain
Model: whatever default model from OpenAI LangChain uses at the moment ('text-davinci' as of 27.07.2023)
import os
import glob
import tiktoken
from typing import List
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv
# Load environment variables (expects OPENAI_API_KEY in a local .env file).
load_dotenv()

# Name of the FAISS index and the path where it is persisted on disk.
indexname = "psymoney_index"
index_path = "./" + indexname

# Directory containing the source PDFs. os.path.join keeps this portable:
# the original hard-coded ".\\docs", which glob never matches on POSIX.
knowledge_base_directory = os.path.join(".", "docs")
def load_and_split_Pdf(path: str) -> List[Document]:
    """Load a PDF file and split it into token-sized chunks.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        Document chunks of roughly 512 tokens with a 24-token overlap,
        ready to be embedded into the vector store.
    """
    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,
        chunk_overlap=24,
    )
    return PyPDFLoader(path).load_and_split(splitter)
# Find all PDF files in the knowledge base directory.
kb_pdf_files = glob.glob(os.path.join(knowledge_base_directory, '*.pdf'))

# Embedding model — feel free to switch to anything other than OpenAI.
embeddings = OpenAIEmbeddings()

# Bootstrap the knowledge base (vector DB) with the first PDF if no index
# has been persisted yet; remaining files are merged in further below.
# NOTE(review): if the docs directory is empty, no index is created and the
# subsequent FAISS.load_local will fail — confirm intended behavior.
if not os.path.exists(index_path):
    if kb_pdf_files:
        first_file = kb_pdf_files[0]
        # Fixed: original message was missing the space before "to".
        print(f"Adding {first_file} to the Knowledge Base")
        texts = load_and_split_Pdf(first_file)
        db = FAISS.from_documents(texts, embeddings)
        db.save_local(indexname)
# Load the persisted Knowledge Base from disk.
index = FAISS.load_local(index_path, embeddings)

# Collect the source-file names of documents already in the knowledge base.
# NOTE(review): this reaches into the docstore's __dict__ (LangChain
# internals — typically a single `_dict` attribute mapping id -> Document),
# so it may break across langchain versions; verify against the installed
# release. Each stored Document carries its origin path in metadata['source'].
file_list = set()
for docList in index.docstore.__dict__.values():
    for dKey in docList:
        file_list.add(docList[dKey].metadata['source'])

# PDFs present on disk but not yet represented in the index.
new_kb_files = {file for file in kb_pdf_files if file not in file_list}
# Merge any newly added PDFs into the existing Knowledge Base.
if new_kb_files:
    for new_file in new_kb_files:
        # Fixed: original message was missing the space before "to".
        print(f"Adding {new_file} to the Knowledge Base")
        texts = load_and_split_Pdf(new_file)
        db = FAISS.from_documents(texts, embeddings)
        index.merge_from(db)
    # Persist once after all merges rather than re-saving per file.
    index.save_local(indexname)
# Language model used to answer questions — using an OpenAI model here,
# feel free to switch to another one. Low temperature for factual answers.
llm = OpenAI(temperature=0.2)

# Conversation memory so follow-up questions retain chat context.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retrieval-augmented QA chain over the FAISS index; max_tokens_limit caps
# how much retrieved context is passed to the model per question.
qa = ConversationalRetrievalChain.from_llm(llm, index.as_retriever(), max_tokens_limit=400, memory=memory)
# Interactive chat loop: keep answering questions until the user types 'exit'.
print("Welcome to the chatbot! Type 'exit' to stop.")
while (query := input("Please enter your question: ")).lower() != 'exit':
    # Run the retrieval-augmented chain on the user's question.
    result = qa({"question": query})
    print("Answer:", result['answer'])
    print()
Contents of the .env file (not part of the script — replace sk-... with your own OpenAI API key):
OPENAI_API_KEY=sk-...