Introduction
Understanding Embedding Models and Similarity
In [ ]:
!pip install -q sentence-transformers
!pip install -q wikipedia-api
!pip install -q numpy
!pip install -q scipy
!pip install -q rich
!pip install -q pypdf2
In [ ]:
import re
import os
from rich import print
from sentence_transformers import SentenceTransformer
import numpy as np
import textwrap
from IPython.display import display, HTML
Load Data
In [ ]:
from wikipediaapi import Wikipedia
wiki = Wikipedia('RAGBot/0.0', 'en')
data = wiki.page('Hayao_Miyazaki').text
## After uploading a PDF (a sketch of a possible load_document helper follows this cell)
# data = load_document("/content/R_Tamil_LLama.pdf")
print(data)
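The commented-out line above calls a load_document helper that is never defined in this notebook. Assuming the intent was to extract plain text with the pypdf2 package installed earlier, a minimal sketch could look like this (the helper name and behaviour are assumptions, not part of the original):
In [ ]:
from PyPDF2 import PdfReader

def load_document(path):
    """Hypothetical helper: extract plain text from a PDF using PyPDF2."""
    reader = PdfReader(path)
    # Join the extracted text of every page; pages with no
    # extractable text contribute an empty string
    return "\n".join(page.extract_text() or "" for page in reader.pages)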
Perform Chunking
In [ ]:
def chunk_text(text, chunk_size=1000, overlap=20):
"""
Split the text into chunks based on the number of words and word overlap.
"""
words = text.split()
chunks = []
for i in range(0, len(words), chunk_size - overlap):
chunk = ' '.join(words[i:i + chunk_size])
chunks.append(chunk)
return chunks
chunked_data = chunk_text(data)
print("Total number of chunks", len(chunked_data))
Visualise Chunking
In [ ]:
def print_chunks(chunks):
"""
Display text chunks in a clean, readable format using HTML styling.
Args:
chunks (list): List of text chunks to display
"""
# Create the HTML for the chunks display
html_content = """
<style>
.chunk-container {
font-family: Arial, sans-serif;
margin: 20px 0;
}
.chunk-header {
background-color: #f0f2f6;
padding: 5px 10px;
border-radius: 5px 5px 0 0;
border-left: 4px solid #3498db;
font-weight: bold;
color: #2c3e50;
}
.chunk-content {
background-color: #ffffff;
color: #2c3e50;
padding: 10px;
border: 1px solid #e1e4e8;
border-left: 4px solid #3498db;
border-top: none;
border-radius: 0 0 5px 5px;
white-space: pre-wrap;
font-family: monospace;
}
</style>
"""
# Add each chunk to the HTML content
for i, chunk in enumerate(chunks, 1):
# Wrap text for better readability
wrapped_text = textwrap.fill(chunk, width=100)
html_content += f"""
<div class="chunk-container">
<div class="chunk-header">Chunk {i}</div>
<div class="chunk-content">{wrapped_text}</div>
</div>
"""
# Display the HTML
display(HTML(html_content))
print_chunks(chunked_data)
Setting Up the Embedding Model
In [ ]:
# Load the sentence transformer model for embeddings
model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
# model = SentenceTransformer("BAAI/bge-small-en-v1.5", trust_remote_code=True)
# model = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)
Set Up the Similarity Function
Understanding Cosine Similarity: reference video
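For two vectors a and b, cosine similarity is the cosine of the angle between them: cos(θ) = (a · b) / (‖a‖ ‖b‖). The function below computes exactly this, step by step.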
In [ ]:
def cosine_similarity(vector_a, vector_b):
"""
Calculate the cosine similarity between two vectors.
Cosine similarity measures how similar two vectors are by calculating the cosine of the angle between them.
Args:
vector_a: First vector (numpy array)
vector_b: Second vector (numpy array)
Returns:
float: Similarity score between -1 and 1
1: Vectors are identical
0: Vectors are perpendicular
-1: Vectors are opposite
"""
# Step 1: Calculate the dot product between the vectors
# Dot product measures how much vectors point in the same direction
dot_product = np.dot(vector_a, vector_b)
# Step 2: Calculate the magnitude (length) of each vector
# Magnitude is the square root of the sum of squared values
magnitude_a = np.linalg.norm(vector_a) # √(a1² + a2² + ... + an²)
magnitude_b = np.linalg.norm(vector_b) # √(b1² + b2² + ... + bn²)
# Step 3: Calculate the cosine similarity
# Divide dot product by the product of magnitudes
similarity = dot_product / (magnitude_a * magnitude_b)
return similarity
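A few hand-checkable vectors make the three reference values in the docstring concrete:
In [ ]:
a = np.array([1.0, 0.0])
b = np.array([0.0, 1.0])
print(cosine_similarity(a, a))   # identical vectors  -> 1.0
print(cosine_similarity(a, b))   # perpendicular      -> 0.0
print(cosine_similarity(a, -a))  # opposite direction -> -1.0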
Understanding Similarity Between Two Sentences
In [ ]:
## Change the sentences accordingly
sentence1 = "The cat sat on the mat"
sentence2 = "A cat is sitting on a mat"
In [ ]:
def get_similarity_score(sentence1, sentence2):
"""
Calculate similarity score between two sentences.
Args:
sentence1 (str): First sentence
sentence2 (str): Second sentence
Returns:
        float: Similarity score between -1 and 1 (in practice, sentence embeddings usually score between 0 and 1)
"""
# Get embeddings
embedding1 = model.encode(sentence1, normalize_embeddings=True)
embedding2 = model.encode(sentence2, normalize_embeddings=True)
# Calculate similarity
similarity = cosine_similarity(embedding1, embedding2)
return similarity
# change the sentences
score = get_similarity_score(sentence1, sentence2)
print(f"Similarity score: {score:.4f}")
Visualise Embeddings
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
def visualize_embeddings(sentence1, sentence2):
"""
Visualize the relationship between two sentence embeddings using
multiple visualization techniques.
Args:
sentence1 (str): First sentence
sentence2 (str): Second sentence
"""
# Get embeddings
embedding1 = model.encode(sentence1, normalize_embeddings=True)
embedding2 = model.encode(sentence2, normalize_embeddings=True)
dimensions = range(len(embedding1))
# Create figure with subplots
fig = plt.figure(figsize=(15, 5))
# Dimension-wise Comparison
    plt.subplot(121)
plt.plot(dimensions, embedding1,
label=f'Sentence 1: "{sentence1[:30]}..."',
alpha=0.7,
linewidth=1)
plt.plot(dimensions, embedding2,
label=f'Sentence 2: "{sentence2[:30]}..."',
alpha=0.7,
linewidth=1)
plt.title('Comparison')
plt.legend()
plt.grid(True)
# 2D PCA Projection
    plt.subplot(122)
# Combine embeddings and apply PCA
combined_embeddings = np.vstack([embedding1, embedding2])
pca = PCA(n_components=2)
projected = pca.fit_transform(combined_embeddings)
plt.scatter(projected[0, 0], projected[0, 1], c='blue', label='Sentence 1', s=100)
plt.scatter(projected[1, 0], projected[1, 1], c='red', label='Sentence 2', s=100)
plt.plot([projected[0, 0], projected[1, 0]],
[projected[0, 1], projected[1, 1]],
'k--', alpha=0.5)
plt.title('2D PCA Projection')
plt.legend()
plt.grid(True)
# Add overall title and adjust layout
plt.suptitle(f'Embedding Relationship Analysis\n"{sentence1}" vs "{sentence2}"',
fontsize=12, y=1.05)
plt.tight_layout()
    # Calculate and display the similarity score
    # (embeddings are normalized, so the dot product equals cosine similarity)
    similarity = np.dot(embedding1, embedding2)
print(f"Similarity Score: {similarity:.4f}")
plt.show()
def plot_embedding_heatmap(sentence1, sentence2):
"""
Create an improved heatmap visualization of embedding similarities.
Args:
sentence1 (str): First sentence
sentence2 (str): Second sentence
"""
# Get embeddings
embedding1 = model.encode(sentence1, normalize_embeddings=True)
embedding2 = model.encode(sentence2, normalize_embeddings=True)
    # Reshape embeddings to square matrices for visualization
    # (dimensions beyond size*size are dropped, e.g. 768 -> 27*27 = 729)
    size = int(np.sqrt(len(embedding1)))
    matrix1 = embedding1[:size*size].reshape(size, size)
    matrix2 = embedding2[:size*size].reshape(size, size)
# Create similarity matrix
similarity_matrix = np.dot(matrix1, matrix2.T)
# Plot setup
plt.figure(figsize=(12, 5))
# Create subplots for both individual embeddings and their similarity
plt.subplot(131)
sns.heatmap(matrix1,
cmap='viridis',
center=0,
cbar_kws={'label': 'Embedding Values'})
plt.title(f'Embedding 1\n"{sentence1[:20]}..."')
plt.subplot(132)
sns.heatmap(matrix2,
cmap='viridis',
center=0,
cbar_kws={'label': 'Embedding Values'})
plt.title(f'Embedding 2\n"{sentence2[:20]}..."')
plt.subplot(133)
sns.heatmap(similarity_matrix,
cmap='coolwarm',
center=0,
cbar_kws={'label': 'Similarity'})
plt.title('Similarity Matrix')
# Calculate overall similarity score
similarity = np.dot(embedding1, embedding2)
# Add overall title with similarity score
plt.suptitle(f'Embedding Analysis (Similarity Score: {similarity:.4f})',
y=1.05)
plt.tight_layout()
plt.show()
return similarity
visualize_embeddings(sentence1, sentence2)
plot_embedding_heatmap(sentence1, sentence2)
Embed Chunks
In [ ]:
def simple_visualize_chunks(chunks):
"""
Create a simple 2D visualization of text chunk relationships.
Args:
chunks (list): List of text chunks to visualize
"""
# Get embeddings and reduce dimensions
embeddings = model.encode(chunks, normalize_embeddings=True, show_progress_bar=True)
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)
# Create plot
plt.figure(figsize=(10, 6))
plt.scatter(reduced[:, 0], reduced[:, 1], c=range(len(chunks)), cmap='viridis')
# Add labels
for i, (x, y) in enumerate(reduced):
plt.annotate(f"Chunk {i+1}", (x, y), xytext=(5, 5), textcoords='offset points')
plt.title("Text Chunks in 2D Space")
plt.grid(True, alpha=0.3)
plt.colorbar(label='Chunk Order')
plt.tight_layout()
plt.show()
# Example usage:
# chunked_data = ["hello", "bird", "how are you doing" , "king"]
simple_visualize_chunks(chunked_data)
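With the chunks embedded, the same similarity machinery can rank chunks against a question, which is the retrieval step of a RAG pipeline. A minimal sketch (the helper name, query, and top_k value are assumptions for illustration):
In [ ]:
def retrieve_top_chunks(query, chunks, top_k=3):
    """Hypothetical helper: return the top_k chunks most similar to the query."""
    chunk_embeddings = model.encode(chunks, normalize_embeddings=True)
    query_embedding = model.encode(query, normalize_embeddings=True)
    # With normalized embeddings, the dot product equals cosine similarity
    scores = chunk_embeddings @ query_embedding
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [(scores[i], chunks[i]) for i in top_indices]

for score, chunk in retrieve_top_chunks("When did Miyazaki found Studio Ghibli?", chunked_data):
    print(f"{score:.4f}  {chunk[:100]}...")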