RAG from scratch
RAG: Retrieval-Augmented Generation
- R: Retrieval - Fetch the right external content.
- A: Augmentation - Augment the prompt with the content fetched in the Retrieval stage.
- G: Generation - Generate the final response using the LLM.
Programmatic Stages:
1. Data Ingestion:
- Parse PDF and extract text.
- Perform text chunking.
- Set up the database.
- Populate the database with parsed data.
2. Retrieval:
- Take the user query as input.
- Perform similarity search across the stored data.
- Retrieve the most relevant chunks of information.
3. Augmentation:
- Augment the prompt by incorporating relevant chunks of retrieved data.
- Adjust the prompt through prompt engineering to optimize for clarity and context.
4. Generation:
- Use the enhanced prompt to generate a response using the LLM.
Data Ingestion
Load Data
In [ ]:
# !pip install -q sentence-transformers
# !pip install -q wikipedia-api
# !pip install -q numpy
# !pip install -q scipy
# !pip install openai
# !pip install rich
# !pip install pypdf2
# !pip install gradio
In [ ]:
import re
import os
import openai
from rich import print
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import numpy as np
import textwrap
from wikipediaapi import Wikipedia
import PyPDF2
In [ ]:
def load_document(file_path):
    """
    Load a document from a given file path. Supports PDF, plain-text, and markdown files.
    """
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
    elif file_extension.lower() in ('.txt', '.md'):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        raise ValueError("Unsupported file format. Please provide a PDF, text, or markdown file.")
    return text

data = load_document("../data/md/attention_is_all_you_need.md")
def load_document(file_path): """ Load document from a given file path. Supports PDF and text files. """ _, file_extension = os.path.splitext(file_path) if file_extension.lower() == '.pdf': with open(file_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) text = "" for page in pdf_reader.pages: text += page.extract_text() elif file_extension.lower() == '.txt': with open(file_path, 'r', encoding='utf-8') as file: text = file.read() elif file_extension.lower() == '.md': with open(file_path, 'r', encoding='utf-8') as file: text = file.read() else: raise ValueError("Unsupported file format. Please provide a PDF or text file.") return text data = load_document("../data/md/attention_is_all_you_need.md")
In [ ]:
print(data)
Perform Chunking
In [ ]:
def chunk_text(text, chunk_size=100, overlap=20):
    """
    Split the text into chunks of `chunk_size` words, with `overlap` words
    shared between consecutive chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunked_data = chunk_text(data)
chunked_data
def chunk_text(text, chunk_size=100, overlap=20): """ Split the text into chunks based on the number of words and word overlap. """ words = text.split() chunks = [] for i in range(0, len(words), chunk_size - overlap): chunk = ' '.join(words[i:i + chunk_size]) chunks.append(chunk) return chunks chunked_data = chunk_text(data) chunked_data
Visualise Chunking
In [ ]:
# Print the list of chunks
def print_chunks(chunks):
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i + 1}:")
        print(chunk)
        print("-" * 50)

print_chunks(chunked_data)
Setting Up the Embedding Model
In [11]:
# Load the sentence transformer model for embeddings
model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5")
Set Up Similarity Function
In [ ]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
Visualise Embeddings
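A minimal sketch of a 2D view of the chunk embeddings, assuming the model loaded above and that matplotlib is available (it is not in the install cell). The PCA projection is done directly with NumPy's SVD.
In [ ]:
import matplotlib.pyplot as plt  # assumption: matplotlib is installed

# Embed a handful of chunks and project them to 2D with PCA (via SVD)
sample_embeddings = model.encode(chunked_data[:20])
centered = sample_embeddings - sample_embeddings.mean(axis=0)
_, _, vt = np.linalg.svd(centered, full_matrices=False)
coords = centered @ vt[:2].T

plt.scatter(coords[:, 0], coords[:, 1])
for i, (x, y) in enumerate(coords):
    plt.annotate(str(i), (x, y))  # label each point with its chunk index
plt.title("2D PCA projection of chunk embeddings")
plt.show()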
Embed Chunks
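A minimal sketch: encode every chunk with the model loaded above. The variable name `embeddings` is an assumption carried through the rest of the notebook.
In [ ]:
# Encode all chunks; the result is one vector per chunk
embeddings = model.encode(chunked_data)
print(embeddings.shape)  # (number of chunks, embedding dimension)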
Store the Vectors/Embeddings
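For a from-scratch pipeline, the embedding matrix itself is the store; persisting it to disk is optional. The `embeddings.npy` filename is an assumption.
In [ ]:
# Persist the embedding matrix so the retrieval stage can reload it later
np.save("embeddings.npy", embeddings)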
Retrieval
Set Up Vector Store
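A minimal in-memory vector store: the embedding matrix plus the chunk texts, index-aligned. Reloading from `embeddings.npy` (saved above) is an assumption; reusing the in-memory `embeddings` works just as well.
In [ ]:
# The "vector store" here is just the matrix of chunk embeddings
# plus the chunk texts in matching order
stored_embeddings = np.load("embeddings.npy")
documents = chunked_data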
Similarity Search
Get Top-K Results
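A sketch of retrieval over the store above: embed the query, score it against every stored vector with `cosine_similarity`, and keep the k highest-scoring chunks. The function name `search`, the example query, and `k=3` are assumptions.
In [ ]:
def search(query, k=3):
    """
    Embed the query, score it against every stored chunk, and
    return the k most similar chunks.
    """
    query_embedding = model.encode(query)
    similarities = [cosine_similarity(query_embedding, emb) for emb in stored_embeddings]
    top_indices = np.argsort(similarities)[::-1][:k]  # indices of the k best scores
    return [documents[i] for i in top_indices]

query = "What is multi-head attention?"
retrieved_chunks = search(query)
retrieved_chunks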
Augmentation
Augmenting the Prompt
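A sketch of prompt augmentation: the retrieved chunks are joined into a context block placed ahead of the question. The template wording is an assumption to tune via prompt engineering.
In [ ]:
def build_prompt(query, chunks):
    # Join retrieved chunks into one context block for the LLM
    context = "\n\n".join(chunks)
    return f"Answer the question using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}"

augmented_prompt = build_prompt(query, retrieved_chunks)
print(augmented_prompt)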
Modifying the System Prompt
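A hypothetical system prompt that keeps the model grounded in the retrieved context; adjust the wording for your use case.
In [ ]:
# Wording is an assumption; refine it through prompt engineering
system_prompt = (
    "You are a helpful assistant. Answer strictly from the provided context. "
    "If the context does not contain the answer, say you don't know."
)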
Generation
Set Up LLM Provider
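A sketch assuming the OpenAI Python SDK v1+ and an OPENAI_API_KEY stored in a .env file (read with dotenv, imported above).
In [ ]:
load_dotenv()  # reads OPENAI_API_KEY from a .env file
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))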
Generate Response
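Finally, send the system prompt and the augmented prompt to the LLM. The model name gpt-4o-mini is an assumption; any available chat model works.
In [ ]:
# Generate the final answer from the augmented prompt
response = client.chat.completions.create(
    model="gpt-4o-mini",  # assumption: substitute any available chat model
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_prompt},
    ],
)
print(response.choices[0].message.content)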