import os
from typing import List, Optional

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Set your OpenAI API key here, or better, load it from the environment or a .env file.
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"  # Replace with your actual API key
def load_data(data_path: str) -> str:
    """
    Loads data from a file. Supports text and markdown. For other file types,
    add appropriate loaders.

    Args:
        data_path: Path to the data file.

    Returns:
        The loaded data as a string, or an empty string on failure.
    """
    try:
        with open(data_path, "r", encoding="utf-8") as f:
            data = f.read()
        return data
    except Exception as e:
        print(f"Error loading data from {data_path}: {e}")
        return ""
def chunk_data(data: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """
    Splits the data into overlapping chunks.

    Args:
        data: The data to be chunked.
        chunk_size: The maximum size of each chunk, in characters.
        chunk_overlap: The overlap between consecutive chunks, in characters.

    Returns:
        A list of text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(data)
    return chunks
def create_embeddings() -> OpenAIEmbeddings:
    """
    Creates the OpenAI embedding model used to embed the text chunks.

    Returns:
        An OpenAIEmbeddings object.
    """
    # The chunks themselves are embedded later, when the vector store is
    # built, so this function only needs to instantiate the model.
    embeddings = OpenAIEmbeddings()
    return embeddings
def create_vector_store(
    chunks: List[str], embeddings: OpenAIEmbeddings
) -> FAISS:
    """
    Creates a vector store from the text chunks and embeddings using FAISS.

    Args:
        chunks: A list of text chunks.
        embeddings: An OpenAIEmbeddings object.

    Returns:
        A FAISS vector store.
    """
    vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store
def create_rag_chain(
    vector_store: FAISS, llm: Optional[OpenAI] = None
) -> RetrievalQA:
    """
    Creates a RAG chain using the vector store and a language model.

    Args:
        vector_store: A FAISS vector store.
        llm: A language model (defaults to OpenAI with temperature=0).

    Returns:
        A RetrievalQA chain.
    """
    # Instantiate the default LLM lazily rather than in the signature: a
    # default argument is built once at import time, before callers can
    # configure the API key or swap in a different model.
    if llm is None:
        llm = OpenAI(temperature=0)
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vector_store.as_retriever()
    )
    return rag_chain
def rag_query(rag_chain: RetrievalQA, query: str) -> str:
    """
    Queries the RAG chain.

    Args:
        rag_chain: A RetrievalQA chain.
        query: The query string.

    Returns:
        The answer from the RAG chain.
    """
    answer = rag_chain.run(query)
    return answer
def main(data_path: str, query: str) -> str:
    """
    Runs the full RAG pipeline: load, chunk, embed, index, and query.

    Args:
        data_path: Path to the data file.
        query: The query string.

    Returns:
        The answer to the query using RAG.
    """
    data = load_data(data_path)
    if not data:
        return "No data loaded. Please check the data path."
    chunks = chunk_data(data)
    embeddings = create_embeddings()
    vector_store = create_vector_store(chunks, embeddings)
    rag_chain = create_rag_chain(vector_store)
    answer = rag_query(rag_chain, query)
    return answer
if __name__ == "__main__":
    # Example usage
    data_path = "data/my_data.txt"  # Replace with your data file
    query = "What is the main topic of this document?"
    answer = main(data_path, query)
    print(f"Query: {query}")
    print(f"Answer: {answer}")
Explanation:
- Import Libraries: Imports the required libraries: os, typing, and the LangChain modules for embeddings, vector stores, text splitting, RAG chains, and LLMs.
- load_data(data_path):
  - Loads data from a file.
  - Supports text and markdown files. You can extend it to handle other file types.
  - Handles potential file loading errors.
- chunk_data(data, chunk_size, chunk_overlap):
  - Splits the input text into smaller, overlapping chunks.
  - This is crucial for handling long documents and improving retrieval accuracy.
- create_embeddings():
  - Instantiates OpenAI's embedding model, which generates numerical representations (embeddings) of the text chunks.
  - Embeddings capture the semantic meaning of the text.
- create_vector_store(chunks, embeddings):
  - Creates a vector store (FAISS) to store the text chunks and their corresponding embeddings.
  - FAISS allows for efficient similarity search, which is essential for retrieval.
- create_rag_chain(vector_store, llm):
  - Creates a RAG chain using LangChain's RetrievalQA class.
  - This chain combines the vector store (for retrieval) with a language model (for generation).
  - The stuff chain type is used, which passes all retrieved documents to the LLM in a single prompt. Other chain types are available for different use cases; see the sketch after this list.
- rag_query(rag_chain, query):
  - Executes a query against the RAG chain.
  - The chain retrieves relevant chunks from the vector store and uses the LLM to generate an answer based on the retrieved information.
- main(data_path, query):
  - Orchestrates the entire RAG process: loads data, chunks it, creates embeddings and a vector store, creates the RAG chain, and queries it.
- if __name__ == "__main__":
  - Provides an example of how to use the main function.
  - Replace "data/my_data.txt" with the actual path to your data file and modify the query.
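For longer documents, the stuff chain type can overflow the model's context window. As a rough, hypothetical sketch (reusing the imports and vector store from the code above; the map_reduce chain type and the k=4 retrieval depth are illustrative choices, not requirements), you could switch the chain type and tune how many chunks the retriever returns:

def create_map_reduce_rag_chain(
    vector_store: FAISS, llm: Optional[OpenAI] = None
) -> RetrievalQA:
    # map_reduce answers over each retrieved chunk separately, then combines
    # the partial answers, so no single prompt has to hold every chunk.
    if llm is None:
        llm = OpenAI(temperature=0)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=vector_store.as_retriever(search_kwargs={"k": 4}),  # top-4 chunks
    )

The trade-off is more LLM calls per query, so map_reduce is slower and costlier than stuff on short documents.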
Key Points:
- Vector Database: A vector database (like FAISS, in this example) is essential for efficient retrieval of relevant information based on semantic similarity.
- Embeddings: Embeddings are numerical representations of text that capture its meaning. OpenAI’s embedding models are used here, but others are available.
- Chunking: Chunking is necessary to break down large documents into smaller, more manageable pieces that can be effectively processed by the LLM.
- RAG Chain: The RAG chain orchestrates the retrieval and generation steps, combining the capabilities of the vector store and the LLM.
- Prompt Engineering: The retrieved chunks are combined with the user's query in a prompt that is passed to the LLM. Effective prompt engineering is crucial for getting good results; a sketch of a custom prompt follows this list.
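As a minimal sketch of that prompt customization (this assumes the stuff chain type, whose question-answering prompt expects context and question input variables; the template wording here is my own illustration, not a LangChain default):

from langchain.prompts import PromptTemplate

# Illustrative template: {context} receives the retrieved chunks and
# {question} receives the user's query.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "Use only the context below to answer the question. "
        "If the answer is not in the context, say you don't know.\n\n"
        "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    ),
)

rag_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    chain_type_kwargs={"prompt": custom_prompt},  # overrides the default prompt
)

Grounding the model in the retrieved context and giving it an explicit way to say "I don't know" helps reduce hallucinated answers.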
Remember to:
- Replace "YOUR_OPENAI_API_KEY" with your actual OpenAI API key. Better, load it from a .env file so the key never lives in source; a sketch follows this list.
- Replace "data/my_data.txt" with the path to your data file.
- Modify the query to ask a question about your data.
- Install the required libraries: langchain, openai, and faiss-cpu (or faiss-gpu if you have a compatible GPU): pip install langchain openai faiss-cpu
- Note that this example uses the classic langchain import paths; newer LangChain releases (0.1 and later) moved these classes into the langchain-community and langchain-openai packages.
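As a small sketch of the .env approach mentioned above (this assumes the python-dotenv package, installed with pip install python-dotenv):

# .env file (keep it out of version control):
# OPENAI_API_KEY=sk-...

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory into os.environ
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your .env file.")

With this in place, you can delete the hardcoded os.environ assignment at the top of the script.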