import json
import os
from typing import List, Tuple

from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS

def load_faq_data(data_path: str) -> List[Tuple[str, str]]:
    """
    Loads FAQ data from a JSON file.

    Args:
        data_path: Path to the JSON file.

    Returns:
        A list of tuples, where each tuple contains a question and its answer.
    """
    try:
        with open(data_path, "r", encoding="utf-8") as f:
            faq_data = json.load(f)
        if not isinstance(faq_data, list):
            raise ValueError("Expected a list of dictionaries in the JSON file.")
        for item in faq_data:
            if not isinstance(item, dict) or "question" not in item or "answer" not in item:
                raise ValueError(
                    "Each item in the list should be a dictionary with 'question' and 'answer' keys."
                )
        return [(item["question"], item["answer"]) for item in faq_data]
    except Exception as e:
        print(f"Error loading FAQ data from {data_path}: {e}")
        return []

def chunk_faq_data(faq_data: List[Tuple[str, str]]) -> List[str]:
    """
    Splits the FAQ data into chunks. Each chunk contains one question and answer.

    Args:
        faq_data: A list of tuples, where each tuple contains a question and its answer.

    Returns:
        A list of strings, where each string is a question and answer concatenated.
    """
    return [f"Question: {q}\nAnswer: {a}" for q, a in faq_data]

def create_embeddings(chunks: List[str]) -> OpenAIEmbeddings:
    """
    Creates embeddings for the text chunks using OpenAI.

    Args:
        chunks: A list of text chunks.

    Returns:
        An OpenAIEmbeddings object.
    """
    return OpenAIEmbeddings()

def create_vector_store(chunks: List[str], embeddings: OpenAIEmbeddings) -> FAISS:
    """
    Creates a vector store from the text chunks and embeddings using FAISS.

    Args:
        chunks: A list of text chunks.
        embeddings: An OpenAIEmbeddings object.

    Returns:
        A FAISS vector store.
    """
    return FAISS.from_texts(chunks, embeddings)

def create_rag_chain(vector_store: FAISS, llm: OpenAI) -> RetrievalQA:
    """
    Creates a RAG chain using the vector store and a language model.
    Adjusted for FAQ format.

    Args:
        vector_store: A FAISS vector store.
        llm: An OpenAI language model.

    Returns:
        A RetrievalQA chain.
    """
    prompt_template = """Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context:
{context}
Question:
{question}
Helpful Answer:"""
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

def rag_query(rag_chain: RetrievalQA, query: str) -> str:
    """
    Queries the RAG chain.

    Args:
        rag_chain: A RetrievalQA chain.
        query: The query string.

    Returns:
        The answer from the RAG chain.
    """
    result = rag_chain(query)
    return result["result"]

def main(data_path: str, query: str) -> str:
    """
    Main function to run the RAG process with FAQ data and OpenAI.

    Args:
        data_path: Path to the JSON file.
        query: The query string.

    Returns:
        The answer to the query using RAG.
    """
    faq_data = load_faq_data(data_path)
    if not faq_data:
        return "No data loaded. Please check the data path."
    chunks = chunk_faq_data(faq_data)
    embeddings = create_embeddings(chunks)
    vector_store = create_vector_store(chunks, embeddings)
    llm = OpenAI(temperature=0)
    rag_chain = create_rag_chain(vector_store, llm)
    answer = rag_query(rag_chain, query)
    return answer

if __name__ == "__main__":
    # Example usage
    data_path = "data/faq.json"
    query = "What is the return policy?"
    answer = main(data_path, query)
    print(f"Query: {query}")
    print(f"Answer: {answer}")

Code Explanation: RAG with FAQ and OpenAI
This Python code implements a Retrieval Augmented Generation (RAG) system specifically designed to answer questions from an FAQ dataset using OpenAI’s language models. Here’s a step-by-step explanation of the code:
1. Import Libraries:
- os: Used for interacting with the operating system, specifically for accessing environment variables (like your OpenAI API key).
- typing: Used for type hinting, which improves code readability and helps with error checking.
- langchain: A framework for developing applications powered by language models. It provides modules for various tasks, including:
  - OpenAIEmbeddings: For generating numerical representations (embeddings) of text using OpenAI.
  - FAISS: For creating and managing a vector store, which allows for efficient similarity search.
  - RetrievalQA: For creating a retrieval-based question answering chain.
  - OpenAI: For interacting with OpenAI’s language models.
  - PromptTemplate: For creating reusable prompt structures.
- json: For working with JSON data, as the FAQ data is expected to be in JSON format.
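Before any of these classes are used, the OpenAI API key has to be available; both OpenAIEmbeddings and OpenAI read it from the environment. A minimal sketch (the key value is a placeholder; exporting it in your shell or a .env file is the safer habit):

import os

# Placeholder key for illustration only; do not hard-code real keys in scripts.
os.environ["OPENAI_API_KEY"] = "sk-..."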
2. load_faq_data(data_path):
- Loads FAQ data from a JSON file.
- It expects the JSON file to contain a list of dictionaries, where each dictionary has a "question" and an "answer" key.
- It performs error handling to ensure the file exists and the data is in the correct format.
- It returns a list of tuples, where each tuple contains a question and its corresponding answer.
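A quick interactive check, assuming a faq.json like the sample shown at the end of this post:

pairs = load_faq_data("data/faq.json")
print(len(pairs))   # 3 entries in the sample file
print(pairs[0])     # ('What is your return policy?', 'We accept returns within 30 days of purchase.')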
3. chunk_faq_data(faq_data):
- Prepares the FAQ data for embedding.
- Each FAQ question-answer pair is treated as a single chunk.
- It formats each question-answer pair into a string like "Question: {q}\nAnswer: {a}".
- It returns a list of these formatted strings.
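Continuing the same example, each chunk is simply one question-answer pair rendered as a single string:

chunks = chunk_faq_data(pairs)
print(chunks[0])
# Question: What is your return policy?
# Answer: We accept returns within 30 days of purchase.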
4. create_embeddings(chunks):
- Uses OpenAI’s OpenAIEmbeddings to convert the text chunks (from the FAQ data) into numerical vectors (embeddings).
- Embeddings capture the semantic meaning of the text.
- Strictly speaking, the function only instantiates the embedding model; the chunk vectors themselves are computed when the chunks are handed to FAISS in the next step.
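If you want to see what an embedding looks like, you can call the model directly (a sketch; the vector length depends on the embedding model OpenAI serves by default, typically 1536 for text-embedding-ada-002):

embeddings = create_embeddings(chunks)
vector = embeddings.embed_query("What is the return policy?")
print(len(vector))   # dimensionality of the embedding, e.g. 1536
print(vector[:5])    # first few floats of the vector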
5. create_vector_store(chunks, embeddings):
- Creates a vector store using FAISS.
- The vector store stores the text chunks along with their corresponding embeddings.
- FAISS enables efficient similarity search.
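You can sanity-check retrieval on its own, before wiring in the LLM. similarity_search returns LangChain Document objects whose page_content is the original chunk text:

vector_store = create_vector_store(chunks, embeddings)
docs = vector_store.similarity_search("Can I return an item?", k=2)
for doc in docs:
    print(doc.page_content)
# The chunk starting with "Question: What is your return policy?" should rank near the top.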
6. create_rag_chain(vector_store, llm):
- Creates the RAG chain, combining the vector store with a language model.
- It uses LangChain’s RetrievalQA chain, which:
  - Retrieves relevant chunks from the vector_store based on the query.
  - Feeds the retrieved chunks and the query to the llm (OpenAI).
  - The LLM generates an answer.
- It uses a custom PromptTemplate to structure the input to the LLM, telling it to answer from the context and say “I don’t know” if the answer isn’t present.
- It sets return_source_documents=True to include the retrieved source documents in the output.
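One knob worth knowing about: as_retriever() accepts a search_kwargs dict, so you can control how many FAQ chunks are retrieved and stuffed into that prompt. A small sketch (k=2 is an arbitrary illustrative value; the retriever's default is typically 4):

# Retrieve only the 2 most similar FAQ chunks instead of the default.
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
docs = retriever.get_relevant_documents("What is the return policy?")
print(len(docs))  # 2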
7. rag_query(rag_chain, query):
- Takes the RAG chain and a user query as input.
- Runs the query against the chain to get the answer.
- Extracts the answer from the result.
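Because return_source_documents=True, the raw chain output is a dict; rag_query keeps only the generated text, but the retrieved chunks are there too if you want to show citations. A small sketch building on the chain created above:

result = rag_chain("What is the return policy?")
print(result["result"])                  # the generated answer
for doc in result["source_documents"]:   # the FAQ chunks that were retrieved
    print("-", doc.page_content.splitlines()[0])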
8. main(data_path, query):
- Orchestrates the RAG process:
- Loads the FAQ data.
- Prepares the data into chunks.
- Creates embeddings and the vector store.
- Creates the RAG chain using OpenAI.
- Runs the query and prints the result.
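End to end, the whole pipeline is one call (using a second question from the sample FAQ file):

answer = main("data/faq.json", "How do I track my order?")
print(answer)
# Expected: an answer grounded in the tracking-number FAQ entry.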
In essence, this code automates answering questions from an FAQ by:
- Loading and formatting the FAQ data.
- Converting the FAQ entries into a searchable format.
- Using a language model to generate answers based on the most relevant FAQ entries.
To use this code with your FAQ data:
- Create a JSON file:
  - Create a JSON file (e.g., faq.json) with your FAQ data in the following format:

    [
      {"question": "What is your return policy?", "answer": "We accept returns within 30 days of purchase."},
      {"question": "How do I track my order?", "answer": "You can track your order using the tracking number provided in your shipping confirmation email."},
      {"question": "What are your shipping costs?", "answer": "Shipping costs vary depending on the shipping method and destination."}
    ]

- Replace "data/faq.json":
  - In the if __name__ == "__main__": block, replace "data/faq.json" with the actual path to your JSON file.
- Modify the query:
  - Change the query variable to ask a question from your FAQ data.
- Run the code:
  - Run the Python script. It will load the FAQ data, create a vector store, and answer your query.