import os
from typing import List
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI, HuggingFacePipeline  # HuggingFacePipeline wraps a local transformers pipeline
from transformers import pipeline  # Loads a local Hugging Face model as a pipeline
# NOTE: An OpenAI API key is still required for OpenAIEmbeddings below, even when a
# local LLM generates the answer. Set it in your environment or a .env file, or swap
# in a local embedding model to remove the OpenAI dependency entirely.
# os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

def load_data(data_path: str) -> str:
    """
    Loads data from a file. Supports plain text and Markdown; add appropriate
    loaders for other file types.

    Args:
        data_path: Path to the data file.

    Returns:
        The loaded data as a string.
    """
    try:
        with open(data_path, "r", encoding="utf-8") as f:
            data = f.read()
        return data
    except Exception as e:
        print(f"Error loading data from {data_path}: {e}")
        return ""

def chunk_data(data: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """
    Splits the data into chunks.

    Args:
        data: The data to be chunked.
        chunk_size: The size of each chunk.
        chunk_overlap: The overlap between chunks.

    Returns:
        A list of text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(data)
    return chunks

def create_embeddings(chunks: List[str]) -> OpenAIEmbeddings:
    """
    Creates the embedding model used to index the text chunks (OpenAI).

    Args:
        chunks: A list of text chunks (unused here; FAISS applies the model later).

    Returns:
        An OpenAIEmbeddings object.
    """
    # Still using OpenAI embeddings for now; this could be replaced
    # with a local alternative to go fully offline.
    embeddings = OpenAIEmbeddings()
    return embeddings

def create_vector_store(
    chunks: List[str], embeddings: OpenAIEmbeddings
) -> FAISS:
    """
    Creates a vector store from the text chunks and embeddings using FAISS.

    Args:
        chunks: A list of text chunks.
        embeddings: An OpenAIEmbeddings object.

    Returns:
        A FAISS vector store.
    """
    vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store

def create_rag_chain(
    vector_store: FAISS,
    llm,  # Base LLM; can be either OpenAI or HuggingFacePipeline.
) -> RetrievalQA:
    """
    Creates a RAG chain using the vector store and a language model.

    Args:
        vector_store: A FAISS vector store.
        llm: A language model (OpenAI or a Hugging Face pipeline wrapper).

    Returns:
        A RetrievalQA chain.
    """
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vector_store.as_retriever()
    )
    return rag_chain

def rag_query(rag_chain: RetrievalQA, query: str) -> str:
    """
    Queries the RAG chain.

    Args:
        rag_chain: A RetrievalQA chain.
        query: The query string.

    Returns:
        The answer from the RAG chain.
    """
    answer = rag_chain.run(query)
    return answer

def main(data_path: str, query: str, use_local_llm: bool = False) -> str:
    """
    Main function to run the RAG process. Now supports local LLMs.

    Args:
        data_path: Path to the data file.
        query: The query string.
        use_local_llm: Flag to use a local LLM (Hugging Face).
            If False, uses OpenAI. Defaults to False.

    Returns:
        The answer to the query using RAG.
    """
    data = load_data(data_path)
    if not data:
        return "No data loaded. Please check the data path."
    chunks = chunk_data(data)
    embeddings = create_embeddings(chunks)
    vector_store = create_vector_store(chunks, embeddings)
    if use_local_llm:
        # Example of using a local LLM from Hugging Face. Choose a model and make
        # sure the necessary libraries (transformers, torch, etc.) are installed.
        # This example uses a small, fast model for demonstration; you'll likely
        # want a larger one for better quality. Adjust the model name and device
        # (CPU/GPU) to match your system.
        local_llm = pipeline(
            "text-generation",
            model="distilgpt2",   # A small, fast model for demonstration.
            device="cpu",         # Use "cuda" for GPU if available.
            max_new_tokens=200,   # Limit the generated output; max_length would also
                                  # count the long stuffed prompt and can fail.
        )
        llm = HuggingFacePipeline(pipeline=local_llm)
    else:
        llm = OpenAI(temperature=0)  # Use OpenAI if use_local_llm is False.
    rag_chain = create_rag_chain(vector_store, llm)
    answer = rag_query(rag_chain, query)
    return answer

if __name__ == "__main__":
    # Example usage
    data_path = "data/my_data.txt"  # Replace with your data file.
    query = "What is the main topic of this document?"
    use_local_llm = True  # Set to True to use a local LLM, False for OpenAI.
    answer = main(data_path, query, use_local_llm)
    print(f"Query: {query}")
    print(f"Answer: {answer}")
The sample code above enables running the LLM locally, using a local Hugging Face model in place of OpenAI for answer generation.
Key Changes:
- Imported HuggingFacePipeline and pipeline: These are needed to load and use a local LLM from Hugging Face.
- Conditional LLM Loading: The main function now takes a use_local_llm argument and uses an if statement to choose between loading an OpenAI LLM or a local Hugging Face LLM.
- Hugging Face Pipeline Example: The code shows how to load and configure a local LLM using the pipeline function from transformers. This example uses distilgpt2, a small, fast model, for demonstration purposes; you'll likely want to replace it with a more capable model.
- device Argument: The device argument in the pipeline call is set to "cpu". If you have a GPU, change this to "cuda" for significantly faster inference.
- OpenAI Key Note: The os.environ["OPENAI_API_KEY"] line stays commented out as a reminder for users who prefer to set the key in code, but note that the embeddings still come from OpenAIEmbeddings, so the key is still required unless you also switch to a local embedding model (see the sketch after this list).
- Added use_local_llm to main and if __name__: The main function now accepts a boolean use_local_llm argument that selects between a local LLM and OpenAI, and the example usage in the if __name__ block sets this flag.
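If you want to remove the OpenAI dependency completely, the embeddings can also come from a local model. The sketch below is one way to do that with LangChain's HuggingFaceEmbeddings wrapper around a sentence-transformers model; the model name all-MiniLM-L6-v2 is just an example choice and assumes sentence-transformers is installed (pip install sentence-transformers).

# Optional sketch: fully local embeddings so no OpenAI API key is needed.
from langchain.embeddings import HuggingFaceEmbeddings

def create_local_embeddings() -> HuggingFaceEmbeddings:
    """Creates a local embedding model (example: a small sentence-transformers model)."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # example model choice
    )

# In main(), swap the embedding step; FAISS.from_texts accepts any embedding
# object, so the OpenAIEmbeddings type hints above would just need loosening.
# embeddings = create_local_embeddings()
# vector_store = create_vector_store(chunks, embeddings)
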
To run this code with a local LLM:
- Install transformers: If you don't have it already, install the transformers library: pip install transformers.
- Choose a Model: Select a suitable LLM from Hugging Face (https://huggingface.co/models). The example code uses "distilgpt2", but you'll likely want a larger, more powerful model for better results. Consider models like gpt2, gpt-j, or others that fit your hardware and needs.
- Modify Model Name: Replace "distilgpt2" in the code with the name of the model you've chosen.
- Set Device: If you have a GPU, change device="cpu" to device="cuda" for faster inference (see the sketch after this list for picking the device automatically).
- Data Path and Query: Make sure data_path points to your data file and that query contains the question you want to ask.
- Run the Code: Run the Python script. The first time you run it with a new model, it will download the model files, which may take some time.
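Putting the model and device choices together, the pipeline setup inside main() could look roughly like this. The model name gpt2-large is only an example placeholder, and the snippet assumes torch is installed so the GPU check works.

import torch
from transformers import pipeline

# Pick the GPU automatically when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

local_llm = pipeline(
    "text-generation",
    model="gpt2-large",   # example replacement for "distilgpt2"; pick a model that fits your hardware
    device=device,
    max_new_tokens=200,   # cap the generated answer length
)
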
Important Considerations:
- Model Size and Hardware: Local LLMs can be very large, and running them efficiently requires significant hardware resources, especially RAM and GPU memory. Choose a model that fits your system's capabilities (a quick check is sketched after this list).
- Dependencies: Ensure you have all the necessary libraries installed, including transformers, torch (if using a GPU), and any other dependencies required by the specific model you choose.
- Performance: Local LLMs may run slower than cloud-based LLMs like OpenAI, especially if you don’t have a powerful GPU.
- Accuracy: The accuracy and quality of the results will depend on the specific local LLM you choose. Smaller, faster models may not be as accurate as larger ones.
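To get a rough feel for whether a given model fits your machine, a quick check like the sketch below can help. It reports available GPU memory via torch and the parameter count of whichever model name you pass; note that it downloads and loads the model, so run it with the model you actually plan to use.

import torch
from transformers import AutoModelForCausalLM

def report_model_fit(model_name: str = "distilgpt2") -> None:
    """Prints rough hardware info and the model's parameter count."""
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        print(f"GPU: {gpu.name}, {gpu.total_memory / 1e9:.1f} GB memory")
    else:
        print("No GPU detected; the model will run on CPU.")
    model = AutoModelForCausalLM.from_pretrained(model_name)
    n_params = sum(p.numel() for p in model.parameters())
    # fp32 weights take about 4 bytes per parameter; half precision roughly halves that.
    print(f"{model_name}: {n_params / 1e6:.0f}M parameters (~{n_params * 4 / 1e9:.1f} GB in fp32)")

report_model_fit("distilgpt2")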