RAG from Scratch
Setup
In [ ]:
!pip install -q sentence-transformers
!pip install -q wikipedia-api
!pip install -q numpy
!pip install -q scipy
Load the Embedding Model:
In [ ]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
Fetch Text Content from Wikipedia:
In [ ]:
from wikipediaapi import Wikipedia
wiki = Wikipedia('RAGBot/0.0', 'en')
doc = wiki.page('Hayao_Miyazaki').text
paragraphs = doc.split('\n\n') # chunking
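The naive `\n\n` split can leave empty strings in `paragraphs`. An optional cleanup sketch (not part of the original notebook) that drops them before embedding:

# Optional: remove empty chunks produced by the naive paragraph split.
paragraphs = [p.strip() for p in paragraphs if p.strip()]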
In [ ]:
import textwrap
In [ ]:
for i, p in enumerate(paragraphs):
    wrapped_text = textwrap.fill(p, width=100)
    print("-----------------------------------------------------------------")
    print(wrapped_text)
    print("-----------------------------------------------------------------")
Embed the Document:
In [ ]:
docs_embed = model.encode(paragraphs, normalize_embeddings=True)
In [ ]:
docs_embed.shape
In [ ]:
docs_embed[0]
Embed the Query:
In [ ]:
query = "What was Studio Ghibli's first film?"
query_embed = model.encode(query, normalize_embeddings=True)
In [ ]:
query_embed.shape
Find the Closest Paragraphs to the Query:
In [ ]:
import numpy as np
similarities = np.dot(docs_embed, query_embed.T)
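Because both the paragraphs and the query were encoded with normalize_embeddings=True, this dot product is exactly the cosine similarity. A quick cross-check with SciPy (installed in Setup but otherwise unused here); the index 0 is arbitrary:

from scipy.spatial.distance import cosine

# Cosine similarity = 1 - cosine distance; for normalized vectors this should
# match the dot-product score above, up to floating-point error.
print(1 - cosine(docs_embed[0], query_embed), similarities[0])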
In [ ]:
similarities.shape
In [ ]:
similarities
In [ ]:
top_3_idx = np.argsort(similarities, axis=0)[-3:][::-1].tolist()
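np.argsort sorts ascending, so [-3:] takes the three highest-scoring paragraphs and [::-1] puts them in descending order. For larger corpora, an equivalent sketch using np.argpartition avoids a full sort; this is an illustrative alternative, not what the notebook uses, and k=3 simply mirrors the choice above:

k = 3
# argpartition places the k largest scores in the last k positions (unordered)...
top_k_idx = np.argpartition(similarities, -k)[-k:]
# ...then a small sort orders just those k indices by descending similarity.
top_k_idx = top_k_idx[np.argsort(similarities[top_k_idx])[::-1]].tolist()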
In [ ]:
top_3_idx
In [ ]:
most_similar_documents = [paragraphs[idx] for idx in top_3_idx]
In [ ]:
CONTEXT = ""
for i, p in enumerate(most_similar_documents):
    wrapped_text = textwrap.fill(p, width=100)
    print("-----------------------------------------------------------------")
    print(wrapped_text)
    print("-----------------------------------------------------------------")
    CONTEXT += wrapped_text + "\n\n"
In [ ]:
query = "What was Studio Ghibli's first film?"
In [ ]:
prompt = f"""
Use the following CONTEXT to answer the QUESTION at the end.
If you don't know the answer, just say that you don't know; don't try to make up an answer.
CONTEXT: {CONTEXT}
QUESTION: {query}
"""
In [ ]:
!pip install -q openai
In [ ]:
# Read the OpenAI API key stored as a Colab secret named 'openai'
from google.colab import userdata
userdata.get('openai')
In [ ]:
from openai import OpenAI
client = OpenAI(api_key=userdata.get('openai'))
In [ ]:
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt},
    ]
)
In [ ]:
print(response.choices[0].message.content)
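The same steps can be folded into a single helper that reuses the objects defined above (model, paragraphs, docs_embed, client). This is a minimal sketch; the function name, the k=3 default, and the inlined prompt wording are illustrative assumptions rather than part of the original notebook:

def answer(question, k=3):
    # Embed the question the same way the paragraphs were embedded.
    q_embed = model.encode(question, normalize_embeddings=True)
    # Rank paragraphs by cosine similarity (dot product of normalized vectors).
    scores = np.dot(docs_embed, q_embed.T)
    top_idx = np.argsort(scores)[-k:][::-1]
    context = "\n\n".join(paragraphs[i] for i in top_idx)
    rag_prompt = (
        "Use the following CONTEXT to answer the QUESTION at the end.\n"
        "If you don't know the answer, just say that you don't know.\n"
        f"CONTEXT: {context}\n"
        f"QUESTION: {question}\n"
    )
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": rag_prompt}],
    )
    return completion.choices[0].message.content

print(answer("What was Studio Ghibli's first film?"))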