Here is a fully functioning script that will allow you to chat with your local files.

Feel free to use it for your own AI projects!

Check out other tutorials: LLMs - Resources & Tutorials


Code base: Python

Storage: FAISS vector DB

Framework: Langchain

Model: whichever default OpenAI model Langchain uses at the moment ('text-davinci' as of 27.07.2023)

import os
import glob
import tiktoken
from typing import List
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv

# Load environment variables (expects OPENAI_API_KEY in a .env file).
load_dotenv()

# Name of the FAISS index and the on-disk path it is stored at.
indexname = "psymoney_index"
index_path = "./" + indexname
# Directory holding the source PDFs.
# Portability fix: the original ".\\docs" is a Windows-style backslash path
# that is NOT a valid separator on Linux/macOS (glob would look for a
# directory literally named ".\docs"). "./docs" works on all platforms,
# since Windows accepts forward slashes as well.
knowledge_base_directory = "./docs"

# Function: Splitting PDFs to chunks
def load_and_split_Pdf(
    path: str, chunk_size: int = 512, chunk_overlap: int = 24
) -> List[Document]:
    """Load a PDF file and split it into token-based chunks.

    Args:
        path: Filesystem path to the PDF file.
        chunk_size: Maximum size of each chunk in tokens, as counted by the
            tiktoken encoder. Defaults to 512 (the original hard-coded value).
        chunk_overlap: Number of tokens shared between consecutive chunks.
            Defaults to 24 (the original hard-coded value).

    Returns:
        A list of langchain ``Document`` objects, one per chunk.
    """
    loader = PyPDFLoader(path)
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return loader.load_and_split(text_splitter)

# Collect every PDF currently present in the knowledge base directory.
pdf_pattern = os.path.join(knowledge_base_directory, '*.pdf')
kb_pdf_files = glob.glob(pdf_pattern)

# Embedding model — swap in any non-OpenAI provider here if desired.
embeddings = OpenAIEmbeddings()

# Create the Knowledge Base (vector db) if it doesn't exist yet, seeding it
# with the first PDF found in the docs directory. Without this seed index,
# the FAISS.load_local call below would fail on first run.
if not os.path.exists(index_path):
    if kb_pdf_files:
        first_file = kb_pdf_files[0]
        # Fixed message: the original concatenation was missing the space
        # before "to the" ("...file.pdfto the Knowledge Base").
        print(f"Adding {first_file} to the Knowledge Base")
        texts = load_and_split_Pdf(first_file)
        db = FAISS.from_documents(texts, embeddings)
        # Save under index_path — same location as "indexname" (which is a
        # relative path to the same directory), but consistent with the path
        # used by FAISS.load_local below.
        db.save_local(index_path)

# Load the Knowledge Base
index = FAISS.load_local(index_path, embeddings)

# Load file names used in the knowledge base
# Walks the docstore's instance attributes and collects the 'source' path
# recorded in each stored Document's metadata, so we can detect which PDFs
# are already indexed.
# NOTE(review): this depends on the PRIVATE layout of the docstore —
# `index.docstore.__dict__` is assumed to expose a dict-of-Documents
# attribute (e.g. InMemoryDocstore's internal `_dict`). Verify against the
# installed langchain version; this is not a public API and may break.
file_list = set()
for docList in index.docstore.__dict__.values():
    for dKey in docList:
        file_list.add(docList[dKey].metadata['source'])

# Newly added PDFs = everything on disk that the index has not seen yet.
new_kb_files = set(kb_pdf_files).difference(file_list)

# Merge any newly added PDFs into the existing Knowledge Base, persisting
# the index after each file so progress survives an interruption.
if new_kb_files:
    for new_file in new_kb_files:
        # Fixed message: the original concatenation was missing the space
        # before "to the" ("...file.pdfto the Knowledge Base").
        print(f"Adding {new_file} to the Knowledge Base")
        texts = load_and_split_Pdf(new_file)
        db = FAISS.from_documents(texts, embeddings)
        index.merge_from(db)
        # index_path resolves to the same directory as "indexname", but is
        # consistent with the path used by FAISS.load_local above.
        index.save_local(index_path)

# Language model — OpenAI by default; substitute any other LLM here.
llm = OpenAI(temperature=0.2)

# Conversation memory so follow-up questions keep their chat context.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retrieval-augmented QA chain over the FAISS index.
retriever = index.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever,
    max_tokens_limit=400,
    memory=memory,
)

# Chatbot loop: answer questions against the Knowledge Base until the user
# types 'exit'.
print("Welcome to the chatbot! Type 'exit' to stop.")
while True:
    query = input("Please enter your question: ")

    # Robustness: strip surrounding whitespace so "exit " / " EXIT" also
    # quit cleanly (the original only matched an exact lowercase token).
    if query.strip().lower() == 'exit':
        break
    result = qa({"question": query})
    print("Answer:", result['answer'])
    print()

Notes:

Prerequisites: a .env file in the working directory containing your OpenAI API key:

OPENAI_API_KEY=sk-...

Untitled