RAG from scratch
RAG: Retrieval-Augmented Generation
- R: Retrieval - Fetch the right external content.
- A: Augmentation - Augment the prompt with the content fetched in the Retrieval stage.
- G: Generation - Generate the final response using the LLM.
Programmatic Stages:
1. Data Ingestion:
- Parse PDF and extract text.
- Perform text chunking.
- Set up the database.
- Populate the database with parsed data.
2. Retrieval:
- Take the user query as input.
- Perform similarity search across the stored data.
- Retrieve the most relevant chunks of information.
3. Augmentation:
- Augment the prompt by incorporating relevant chunks of retrieved data.
- Adjust the prompt through prompt engineering to optimize for clarity and context.
4. Generation:
- Use the enhanced prompt to generate a response using the LLM.
Data Ingestion
Load Data
In [ ]:
# !pip install -q sentence-transformers
# !pip install -q wikipedia-api
# !pip install -q numpy
# !pip install -q scipy
# !pip install openai
# !pip install rich
# !pip install pypdf2
# !pip install gradio
In [ ]:
import re
import os
import openai
from rich import print
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import numpy as np
import textwrap
from wikipediaapi import Wikipedia
import PyPDF2
In [ ]:
def load_document(file_path):
    """
    Load a document from a given file path. Supports PDF, plain-text, and markdown files.
    """
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
    elif file_extension.lower() in ('.txt', '.md'):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        raise ValueError("Unsupported file format. Please provide a PDF, text, or markdown file.")
    return text

data = load_document("../data/md/attention_is_all_you_need.md")
def load_document(file_path): """ Load document from a given file path. Supports PDF and text files. """ _, file_extension = os.path.splitext(file_path) if file_extension.lower() == '.pdf': with open(file_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) text = "" for page in pdf_reader.pages: text += page.extract_text() elif file_extension.lower() == '.txt': with open(file_path, 'r', encoding='utf-8') as file: text = file.read() elif file_extension.lower() == '.md': with open(file_path, 'r', encoding='utf-8') as file: text = file.read() else: raise ValueError("Unsupported file format. Please provide a PDF or text file.") return text data = load_document("../data/md/attention_is_all_you_need.md")
In [ ]:
print(data)
Perform Chunking
In [ ]:
def chunk_text(text, chunk_size=100, overlap=20):
    """
    Split the text into chunks of `chunk_size` words, with `overlap` words
    shared between consecutive chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunked_data = chunk_text(data)
chunked_data
def chunk_text(text, chunk_size=100, overlap=20): """ Split the text into chunks based on the number of words and word overlap. """ words = text.split() chunks = [] for i in range(0, len(words), chunk_size - overlap): chunk = ' '.join(words[i:i + chunk_size]) chunks.append(chunk) return chunks chunked_data = chunk_text(data) chunked_data
Visualise Chunking
In [ ]:
# Print the list of chunks
def print_chunks(chunks):
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i + 1}:")
        print(chunk)
        print("-" * 50)

print_chunks(chunked_data)
Setting Up the Embedding Model
In [11]:
# Load the sentence transformer model for embeddings
model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5")
Set Up Similarity Function
In [ ]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
Visualise Embeddings
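A minimal sketch of a 2D view of the chunk embeddings, assuming the model loaded above and that matplotlib is available (it is not in the install cell). The PCA projection is done directly with NumPy's SVD.
In [ ]:
import matplotlib.pyplot as plt  # assumption: matplotlib is installed

# Embed a handful of chunks and project them to 2D with PCA (via SVD)
sample_embeddings = model.encode(chunked_data[:20])
centered = sample_embeddings - sample_embeddings.mean(axis=0)
_, _, vt = np.linalg.svd(centered, full_matrices=False)
coords = centered @ vt[:2].T

plt.scatter(coords[:, 0], coords[:, 1])
for i, (x, y) in enumerate(coords):
    plt.annotate(str(i), (x, y))  # label each point with its chunk index
plt.title("2D PCA projection of chunk embeddings")
plt.show()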
Embed Chunks
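A minimal sketch: encode every chunk with the model loaded above. The variable name `embeddings` is an assumption carried through the rest of the notebook.
In [ ]:
# Encode all chunks; the result is one vector per chunk
embeddings = model.encode(chunked_data)
print(embeddings.shape)  # (number of chunks, embedding dimension)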
Store the Vectors/Embeddings
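For a from-scratch pipeline, the embedding matrix itself is the store; persisting it to disk is optional. The `embeddings.npy` filename is an assumption.
In [ ]:
# Persist the embedding matrix so the retrieval stage can reload it later
np.save("embeddings.npy", embeddings)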
Retrieval
Set Up Vector Store
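A minimal in-memory vector store: the embedding matrix plus the chunk texts, index-aligned. Reloading from `embeddings.npy` (saved above) is an assumption; reusing the in-memory `embeddings` works just as well.
In [ ]:
# The "vector store" here is just the matrix of chunk embeddings
# plus the chunk texts in matching order
stored_embeddings = np.load("embeddings.npy")
documents = chunked_data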
Similarity Search
Get Top-K Results
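A sketch of retrieval over the store above: embed the query, score it against every stored vector with `cosine_similarity`, and keep the k highest-scoring chunks. The function name `search`, the example query, and `k=3` are assumptions.
In [ ]:
def search(query, k=3):
    """
    Embed the query, score it against every stored chunk, and
    return the k most similar chunks.
    """
    query_embedding = model.encode(query)
    similarities = [cosine_similarity(query_embedding, emb) for emb in stored_embeddings]
    top_indices = np.argsort(similarities)[::-1][:k]  # indices of the k best scores
    return [documents[i] for i in top_indices]

query = "What is multi-head attention?"
retrieved_chunks = search(query)
retrieved_chunks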
Augmentation
Augmenting the Prompt
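A sketch of prompt augmentation: the retrieved chunks are joined into a context block placed ahead of the question. The template wording is an assumption to tune via prompt engineering.
In [ ]:
def build_prompt(query, chunks):
    # Join retrieved chunks into one context block for the LLM
    context = "\n\n".join(chunks)
    return f"Answer the question using only the context below.\n\nContext:\n{context}\n\nQuestion: {query}"

augmented_prompt = build_prompt(query, retrieved_chunks)
print(augmented_prompt)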
Modifying the System Prompt
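A hypothetical system prompt that keeps the model grounded in the retrieved context; adjust the wording for your use case.
In [ ]:
# Wording is an assumption; refine it through prompt engineering
system_prompt = (
    "You are a helpful assistant. Answer strictly from the provided context. "
    "If the context does not contain the answer, say you don't know."
)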
Generation
Set Up LLM Provider
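A sketch assuming the OpenAI Python SDK v1+ and an OPENAI_API_KEY stored in a .env file (read with dotenv, imported above).
In [ ]:
load_dotenv()  # reads OPENAI_API_KEY from a .env file
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))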
Generate Response
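Finally, send the system prompt and the augmented prompt to the LLM. The model name gpt-4o-mini is an assumption; any available chat model works.
In [ ]:
# Generate the final answer from the augmented prompt
response = client.chat.completions.create(
    model="gpt-4o-mini",  # assumption: substitute any available chat model
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_prompt},
    ],
)
print(response.choices[0].message.content)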