Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Retrieval-Augmented Generation (RAG) — A (Very) Basic Example¶

Retrieval-Augmented Generation (RAG) is a powerful technique that enhances the capabilities of language models by grounding their responses in external, context-specific knowledge. Instead of relying solely on the model's internal parameters, a RAG pipeline retrieves relevant information from a curated dataset and feeds it to the model at generation time. This notebook provides a gentle, hands-on introduction to the core ideas behind RAG, using a very small example that highlights the essential components without the complexity of large-scale production systems.

In this notebook, we will work with a lightweight, locally-running pretrained LLM and a small dataset of news articles that serves as our knowledge repository. By keeping both the model and the dataset intentionally small, we can focus on understanding the mechanics of retrieval, the structure of the pipeline, and how the retrieved context influences the final model output. This makes it easier to experiment, visualize, and reason about what is happening at each stage.

Throughout the notebook, we will build each part of the pipeline step-by-step: preprocessing the data, creating vector embeddings, storing them in a simple index, retrieving the most relevant passages based on a user query, and finally passing this context to the LLM to produce an augmented answer. Along the way, we will explain the purpose and logic behind each stage, ensuring you gain a clear conceptual understanding of how RAG systems work.

By the end, you should have a functional minimal RAG implementation that you can run on your own machine, as well as the foundational knowledge needed to scale the approach to larger models, datasets, and more advanced architectures. This small demonstration serves as a stepping stone toward designing robust retrieval-enhanced AI applications.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires several Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been imported successfully.

In [1]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

from src.utils.data.files import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
articles, _ = download_dataset("/text/corpora/news/sciencenews-articles-sampled.csv")
File 'data/datasets//text/corpora/news/sciencenews-articles-sampled.csv' already exists (use 'overwrite=True' to overwrite it).

Preliminaries¶

Before checking out this notebook, please consider the following:

  • The focus of this notebook is on simplicity and clarity to understand the basic components of a RAG system and their implementation. In particular, we only consider a very small dataset for the knowledge repository and ignore considerations relevant for large-scale RAG systems (e.g., advanced chunking strategies, hybrid indexing strategies, performance optimizations).

  • For the generation of responses, we will be using a small, locally running, pretrained LLM instead of a Cloud-based API. The main reason is that we can be sure that the queries we will be asking cannot be answered by the LLM alone since the required information is too recent and beyond the knowledge cutoff of the model.


Quick Recap: Retrieval-Augmented Generation (RAG)¶

RAG is an approach that enhances LLM outputs by incorporating information retrieved from an external knowledge source. Instead of relying solely on what the model has learned during training — which may be incomplete, outdated, or too general — RAG dynamically fetches relevant documents, passages, or facts at query time. This makes the model's responses more grounded, accurate, and context-aware, especially in domains such as news, technical documentation, customer support, or any setting where up-to-date or specialized information is crucial. RAG can significantly reduce hallucinations, improve factuality, and enable smaller models to perform competitively by leveraging external knowledge.

A basic RAG pipeline operates through three core steps as shown in the figure below. In the retrieval step, the system takes the user's query and uses it to find similar/related/relevant document chunks from a knowledge repository. In the augmentation step, the retrieved (and potentially ranked, filtered, etc.) chunks are combined with the original query to form an enriched prompt. This augmented input provides the LLM with external, context-specific information that it did not necessarily learn during training. Finally, in the generation step, the LLM processes this combined input and produces an answer grounded in the retrieved content. Because the model has access to relevant evidence, the output is generally more accurate, detailed, and less prone to hallucination than a standard model-only response.

In this notebook, we will create a small knowledge repository based on science news articles that were published after the pretrained model we will be using was trained. This means that we can ask questions the model is, in principle, incapable of answering correctly without access to the relevant information. However, we will see that the model will still answer "something", highlighting one of the biggest problems when using LLMs in practice: hallucinations.


Loading & Testing the Pretrained LLM¶

Instead of using a Cloud-based API to submit a prompt to an LLM, we will locally run and use a small pretrained LLM. More specifically, we will be using TinyLlama-1.1B-Chat-v1.0, a compact, instruction-tuned language model designed to offer strong conversational and reasoning capabilities at a fraction of the size of mainstream LLMs. With only 1.1 billion parameters, it is lightweight enough to run efficiently on standard CPUs or modest GPUs, making it easy to embed in small applications, notebooks, or teaching examples. Despite its small footprint, TinyLlama is trained on high-quality conversational and general-purpose data, allowing it to deliver coherent responses and follow instructions reliably.

For our simple RAG setup, the model's small size ensures fast inference while still being expressive enough to clearly demonstrate how retrieved context influences generation. More importantly, however, according to its page on Hugging Face, the model was trained starting from Sep 1, 2023, for 90 days. This means that the training does not include any data from 2024 onwards. We can therefore be sure that the model does not know the answer to any question about more recent topics. This would be different if we used a Cloud-based API to access an LLM, which might search the Internet for an answer "on the fly".

The code cell below defines the Hugging Face model identifier for TinyLlama-1.1B-Chat-v1.0. By changing the identifier to a different pretrained model, you can easily test the RAG setup using other models. By default, we use the TinyLlama model due to its small size and its solid performance for that size.

In [3]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

Load Pretrained Model¶

To load TinyLlama-1.1B-Chat-v1.0 we can use the AutoModelForCausalLM class from the transformers library. This class automatically loads the correct causal language modeling architecture based on a model’s configuration, so you do not need to know its internal model class beforehand. By calling AutoModelForCausalLM.from_pretrained(), the library inspects the model metadata and returns an instance optimized for next-token prediction — ideal for text generation, chatbots, and RAG pipelines. Common arguments such as device_map="auto" allow Transformers to automatically distribute the model across available hardware (e.g., CPU, GPU, or multiple GPUs), while dtype="auto" selects the most efficient and compatible precision (such as float16 or bfloat16) supported by your device. These options make model loading easier, more flexible, and optimized for a wide range of hardware setups.

In [4]:
llm = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype="auto")

Load Pretrained Tokenizer¶

To use a pretrained language model correctly, we also need to load the matching pretrained tokenizer because the tokenizer defines how text is converted into the model's input tokens — the exact numerical IDs the model was trained to understand. Different models use different vocabularies, special tokens, and tokenization rules, so using a mismatched or generic tokenizer can lead to incorrect token IDs, degraded performance, or even unusable output. The tokenizer ensures consistent handling of text (including punctuation, whitespace, and special formatting), and it also defines how to decode the model's outputs back into readable text. Without the correct tokenizer, the model cannot reliably interpret user input or express coherent responses, making it a critical component of any generation or RAG workflow.

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.eos_token

Regarding the second line in the previous code cell, setting tokenizer.pad_token = tokenizer.eos_token is often necessary because many causal language models, including Llama-style models, do not come with a dedicated padding token by default. Padding is required when batching sequences of different lengths or when using features like attention masks during generation or training. By assigning the pad_token to be the same as the eos_token, we give the model a safe, semantically neutral token that it already understands, avoiding errors or undefined behavior. This ensures that padding does not introduce unknown tokens, keeps batch processing stable, and maintains compatibility with the model's vocabulary and training setup.
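As a quick, optional illustration, the snippet below batches two inputs of different lengths; with the pad token set as above, the tokenizer can pad the shorter sequence and return a matching attention mask (the example sentences are arbitrary):

In [ ]:
# Optional illustration: batching inputs of different lengths requires a pad token
batch = tokenizer(["Hello!", "This is a noticeably longer example sentence."],
                  padding=True, return_tensors="pt")
print(batch["input_ids"].shape)   # both sequences are padded to the same length
print(batch["attention_mask"])    # zeros mark the padded positions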

Create Auxiliary Method to Prompt LLM¶

With the LLM and accompanying tokenizer loaded, we can now prompt the model to generate outputs. To this end, the code cell below defines the method generate_output(), which combines the three main required steps for convenient use:

(1) Application of chat template. TinyLlama-1.1B-Chat-v1.0 is an instruction-tuned chat model, meaning it was trained using a specific conversation formatting style that includes system, user, and assistant roles. The tokenizer's apply_chat_template() method automatically formats a list of chat messages into the exact prompt structure the model expects such as adding role tokens, separators, and special formatting that mirrors its training data. Without applying this template, the model would receive raw text without the conversational cues it relies on, leading to inconsistent or low-quality responses.

(2) Generation of output tokens. The apply_chat_template() method not only formats the input but also tokenizes the final prompt and converts each token to its corresponding token id. The resulting sequence of token ids can then serve as a valid input for the LLM to generate its response, which again is a sequence of token ids.

(3) Decoding of generated tokens. To return the response as human-readable text, we use the tokenizer's decode() method to convert the token ids of the response back into tokens (i.e., mainly words). Lastly, we can ignore everything before the <|assistant|> tag in the model's response: it is simply the echoed prompt and chat template, not generated content. Only the text after <|assistant|> represents the actual assistant-generated answer.

In [6]:
def generate_output(model, tokenizer, messages, max_length=100, temperature=0.01):
    # Apply prompt template and convert to token ids (important: add "<|assistant|>" as generation prompt)
    prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    # Use model to generate output
    outputs = model.generate(
        prompt_ids,
        max_new_tokens=max_length,              # Limit the number of tokens in the response
        do_sample=True,                         # Enable sampling for diversity
        temperature=temperature,                # Sampling temperature; lower = more deterministic
    )
    # Decode the generated token IDs back into text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract and return only the assistant's reply (remove the prompt)
    return response.split("<|assistant|>\n")[-1].strip()

Notice the two default parameters of the generate_output() method. For one, max_length=100 limits the number of generated tokens to 100, simply to ensure that each response does not take too long to generate. By setting temperature=0.01, which is generally considered a very low temperature, the model will produce more deterministic and conservative responses. Temperature controls how much randomness is introduced into the sampling process: lower values (close to 0) make the model favor the most probable next tokens, leading to precise, predictable, and often more factual answers. This reduces creativity and variability but increases consistency and stability, which is particularly useful for tasks like question answering, RAG pipelines, or situations where correctness is more important than diversity.
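If you want to see the effect of the temperature yourself, you can compare a near-deterministic and a more exploratory setting on a simple prompt; the question below is just an illustrative example, and your outputs will differ from run to run:

In [ ]:
demo_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe a sunset in one sentence."},
]
print(generate_output(llm, tokenizer, demo_messages, temperature=0.01))  # conservative, repeatable
print(generate_output(llm, tokenizer, demo_messages, temperature=0.9))   # more varied wording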

Testing the Model¶

We are now ready to test the pretrained LLM, first without using RAG. For this, the code cell below includes two example questions referring to information not known to the model: Hurricane Melissa in 2025 and the number of shark attacks in 2024. We naturally picked these questions for this notebook because we know the relevant answers can be found in the documents we will later use to create the knowledge repository for our RAG system.

Recall that TinyLlama-1.1B-Chat-v1.0 uses a message format with system, user, and assistant roles to structure conversations, helping the model understand context and generate appropriate responses. The system role provides high-level instructions or context for the model, such as behavior guidelines or a persona. The user role contains the human's input or query, while the assistant role represents the model's generated response. We therefore need to embed the question into this required format.

In [7]:
query = "What were the wind speeds of hurricane Melissa?"
#query = "How many shark attacks have been in 2024 that ended deadly?"

messages = [
    {"role": "system", "content": "You are a helpful assisstant."},
    {"role": "user", "content": query},
]

# Use model to generate answer
answer = generate_output(llm, tokenizer, messages)

print(answer)
The wind speeds of Hurricane Melissa were estimated to be around 155 mph (250 km/h) on August 29, 1995, which was a Category 5 hurricane at the time.

Notice that for both questions, the model will not respond with "I don't know" or something similar but will generate a plausible-sounding answer. However, we already mentioned that the model cannot possibly know the correct answer since the relevant information is past the model's knowledge cutoff. This is one of the most basic examples of a model hallucinating: the model generates information that is plausible-sounding but factually incorrect, misleading, or entirely made up. In other words, the model produces confident responses that do not reflect reality or the underlying data it was trained on.


Creating the Knowledge Repository¶

The knowledge repository is the core component of any RAG pipeline as it serves as the external source of truth that an LLM can consult when generating answers. It allows the model to retrieve relevant documents, passages, or structured data and ground its responses in accurate, up-to-date, and domain-specific information. In practice, the repository can take many forms: a collection of text files, a vector database of embedded documents, a relational database, or even a hybrid system. Its key purpose is to store information in a way that allows efficient retrieval, typically through vector similarity search or keyword search.

Source Documents¶

To keep it simple, we consider a dataset of 187 news articles to create the knowledge repository. More specifically, we use articles from ScienceNews.org, a long-running, nonprofit science magazine published by the Society for Science & the Public. It provides independent, up-to-date coverage of scientific research and discoveries across diverse fields like biology, physics, health, Earth science, and technology. The dataset contains all 187 articles published in the months of August, September, and October of 2025, and therefore published after the knowledge cutoff of our pretrained LLM.

We provide the articles in a single .csv file containing the publication date, the title, the url, and the content of each article. The content has already been preprocessed to remove all HTML markup. The content of each article is now a simple list of paragraphs, with paragraphs separated by two newline characters. Given this simple format, we can directly load all articles into a Pandas DataFrame:

In [8]:
df = pd.read_csv(articles)

print(f"Number of articles: {len(df)}")
Number of articles: 187

Before going on, let's also have a brief look at the content of the DataFrame.

In [9]:
df.head()
Out[9]:
PUBLISHED TITLE URL CONTENT
0 2025-10-31 12:00:00-04:00 A new AI technique may aid violent crime foren... https://www.sciencenews.org/article/ai-violent... Crime scene clues from blowflies may help reve...
1 2025-10-31 10:00:00-04:00 Cancer treatments may get a boost from mRNA CO... https://www.sciencenews.org/article/cancer-imm... The mRNA COVID-19 vaccines might make some can...
2 2025-10-30 12:36:11-04:00 Nanotyrannus was not a teenaged T. rex https://www.sciencenews.org/article/nanotyrann... For decades, researchers have debated whether ...
3 2025-10-30 10:00:00-04:00 This flower smells like injured ants — and fli... https://www.sciencenews.org/article/flower-emi... A Japanese flower lures in its pollinators wit...
4 2025-10-29 12:43:40-04:00 Some planets might home brew their own water https://www.sciencenews.org/article/planets-ma... Some planets might produce their own water ins...

Chunking¶

Recall that the knowledge repository of a RAG system typically does not store and index complete documents, but instead first splits documents into chunks, which are then stored and indexed. This has several practical and performance-related reasons:

  • Improved retrieval accuracy. Long documents often contain many unrelated topics, so retrieving an entire document increases the chance that irrelevant or noisy text will be fed into the LLM. Smaller, semantically coherent chunks allow the retriever to match queries to just the most relevant portions, leading to more precise grounding and reducing hallucinations.

  • Better retrieval efficiency. Vector databases work best when embeddings represent focused ideas. Embedding an entire long document dilutes meaning and harms similarity scoring, while embedding short sections produces more meaningful vectors. Chunking also allows the system to retrieve only the necessary fragments, saving computation and bandwidth.

  • Better generation quality. LLMs have context-window limits, so feeding in an entire document wastes space and may exceed model capacity. Well-sized chunks ensure that retrieved information fits comfortably into the prompt while still providing useful detail.

As the news articles are not very long, we probably do not really need to split them into chunks. However, since our focus is to cover the main components and steps of a RAG pipeline, we do chunk the news articles. In practical RAG systems, the quality of retrieval often depends on how well documents are broken into meaningful chunks. Many existing libraries (e.g., LangChain, LlamaIndex, and Haystack) provide a variety of advanced chunking strategies, including semantic splitting, recursive character splitting, and token-aware chunking — a full exploration is beyond the scope of this notebook. These tools help automate the process and are optimized for real-world workloads, where large corpora and complex document structures benefit from sophisticated preprocessing.
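As a point of reference only (we will not use it in our pipeline), a library-based splitter typically takes just a few lines. The sketch below assumes the optional langchain-text-splitters package; the exact import path and arguments may differ between versions:

In [ ]:
# Not part of our pipeline -- just a sketch of what a library-based splitter looks like
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100,
                                          separators=["\n\n", "\n", " "])
example_chunks = splitter.split_text(df.iloc[0]["CONTENT"])
print(f"First article split into {len(example_chunks)} chunks")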

However, for the purpose of this notebook, the goal is clarity and foundational understanding rather than using full-featured frameworks. Therefore, we implement a simple manual chunking approach to demonstrate the core idea in the most transparent way possible. By crafting the chunking "by hand", you can clearly observe how documents are segmented, how embeddings are generated for each chunk, and how retrieval operates over this structure, thus offering an intuitive grasp of the mechanics behind more advanced RAG pipelines.

To keep things simple, we use a basic but meaningful chunking strategy: We first split each document into a list of paragraphs. Then, iterating through all paragraphs, we combine paragraphs into a chunk as long as the length of the chunk in terms of tokens stays below a specified threshold. Of course, we also want to make sure that chunk boundaries fall only between paragraphs and never within a paragraph. The figure below illustrates this idea, where an article is split into $3$ chunks and no chunk exceeds the maximum length limit.


The method split_document() in the code cell below implements this task of splitting a single input document into chunks. Since we already know that paragraphs are separated by $2$ newline characters, splitting the document into a list of paragraphs is trivial. The main part of the method is to iterate over all paragraphs to check if multiple adjacent paragraphs should form a single chunk. To this end, the method keeps track of the current length of the chunk and checks if adding a new paragraph would exceed the specified limit. Notice that we need the tokenizer here since we need to know the length of a paragraph in terms of its number of tokens.

In [10]:
def split_document(content, tokenizer, max_tokens=200, paragraph_separator="\n\n"):
    # Split document into a list of paragraphs
    paragraphs = [ p.strip() for p in content.split(paragraph_separator) ]
    chunks, chunk, token_count = [], [], 0
    for p in paragraphs:
        # Tokenize paragraph and encode to token ids
        tokens = tokenizer.encode(p)
        # Check if we can add paragraph to current chunk        
        if token_count + len(tokens) <= max_tokens:
            # If so, add paragraph to current chunk
            chunk.append(p)
            token_count += len(tokens)
        else:
            # If not, add the current chunk to the final list and start a new chunk from this paragraph (truncated to max_tokens if it is too long)
            chunks.append("\n\n".join(chunk))
            chunk = [ tokenizer.decode(tokens[:max_tokens], skip_special_tokens=True) ]
            token_count = len(tokens[:max_tokens])
    # Add any remaining content as its own chunk
    if token_count > 0:
        chunks.append("\n\n".join(chunk))
    return chunks

We can now iterate through all rows in our Pandas DataFrame containing the news articles and split each article into chunks. For each chunk, we not only store the text (i.e., the combined paragraphs) but also metadata such as the publication date, the title, and the url of the article. We will later see how this can be used to provide references/sources when generating responses and returning them to users — one of the main benefits of RAG!

Just run the code cell below to generate all chunks. Since our collection of documents is very small, this will not take long.

In [11]:
chunks = []

for idx, row in df.iterrows():
    published, title, url, content = row
    article_chunks = split_document(content, tokenizer)
    for chunk in article_chunks:
        chunks.append({
            "text": chunk,
            "metadata": {"published": published, "title": title, "source": url}
        })

print(f"Total number of chunks: {len(chunks)}")
Total number of chunks: 1312

We are now ready to store and index our generated chunks into a knowledge repository.

Indexing¶

To make use of the chunks, we not only have to store them but also make them searchable so that the retriever can effectively and efficiently find the most relevant chunks given a user's question. There are two common approaches for indexing chunks in RAG systems, full-text search and vector search, each offering different strengths depending on the use case. Full-text search relies on keyword or term-based matching, using techniques such as inverted indexes, tokenization, and ranking algorithms like BM25. Its main advantages are speed, transparency, and precision for queries with clearly matching vocabulary. It also works very well on structured or technical text where keywords are stable and predictable. However, its biggest limitation is that it struggles with semantic similarity: if the user's query uses different wording or phrasing than the original text, full-text search may fail to retrieve relevant chunks.

Vector search, on the other hand, embeds chunks and queries into dense numerical vectors and retrieves results based on semantic similarity rather than exact keyword overlap. This allows it to capture deeper meaning, making it robust to paraphrasing, synonyms, and concept-level queries. As a trade-off, vector search is more computationally expensive, requires embedding models, and may return semantically “close” but not necessarily precise or factual matches. In practice, many RAG systems combine both approaches (often called hybrid search) to achieve both precise keyword matching and strong semantic recall.

For our practical example here, we will be using the vector search approach to benefit from semantic similarity search. However, this means that we need a model to embed chunks and queries into meaningful embedding vectors. Luckily, many pretrained models for this task exist and are publicly available. Our embedding model of choice in this notebook is all-MiniLM-L6-v2, a lightweight, high-performance sentence-embedding model from the SentenceTransformers family. It is based on a distilled version of Microsoft's MiniLM architecture and produces 384-dimensional embeddings optimized for semantic similarity, clustering, and information retrieval tasks. Despite being small and fast, it delivers strong performance across a wide range of benchmarks, making it one of the most popular embedding models for RAG systems and semantic search.

In [12]:
embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"

To load the model, we can use the SentenceTransformer class that is part of the sentence_transformers Python library, designed to make working with sentence- and document-embedding models simple and intuitive. Under the hood, it wraps transformer-based architectures and handles tokenization, batching, and model inference, making embedding generation accessible with just a few lines of code. To load the pretrained all-MiniLM-L6-v2 model, we simply instantiate the class with the model name. The library automatically downloads the model (if not cached) and prepares it for use.

In [13]:
embed_model = SentenceTransformer(embed_model_id)

Once loaded, we can call model.encode() on text or lists of documents to generate embeddings that can be stored in a vector index or used directly for semantic search.
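As a quick, optional sanity check, encoding a few example sentences and comparing their normalized embeddings shows that semantically related sentences receive a noticeably higher similarity score (the sentences are made up for illustration):

In [ ]:
vecs = embed_model.encode(
    ["A hurricane made landfall in the Caribbean.",
     "A powerful tropical storm hit Jamaica.",
     "Bread is made from flour, water, and yeast."],
    normalize_embeddings=True,
)
print(f"Related pair:   {np.dot(vecs[0], vecs[1]):.3f}")   # expected to be relatively high
print(f"Unrelated pair: {np.dot(vecs[0], vecs[2]):.3f}")   # expected to be lower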

To store and index the vectors for searching, we are using FAISS (Facebook AI Similarity Search), an open-source library developed by Meta AI for efficient similarity search and clustering of dense vectors. It is widely used in RAG systems, recommendation engines, and large-scale vector databases because it can store millions to billions of embeddings and retrieve the nearest neighbors extremely fast. FAISS provides both exact and approximate search algorithms and supports CPU and GPU acceleration, making it suitable for high-performance applications. The benefits of FAISS include speed, scalability, and flexibility: it handles massive datasets efficiently, offers GPU acceleration, and provides multiple indexing strategies to balance accuracy and performance. It is one of the most widely used tools for vector search in production-grade RAG pipelines.

The code cell below performs all the required steps:

  • Encoding of chunks. First, we convert all chunks (more specifically, the text of each chunk) into their vector representations by encoding them with the pretrained all-MiniLM-L6-v2 embedding model. Since our collection of chunks is very small, we can do this for all chunks at once to get a list of embedding vectors. Notice that we set normalize_embeddings=True in the encode() method to L2-normalize all embedding vectors (see the reason below).

  • Storing and indexing embedding vectors with FAISS. FAISS supports different indexing strategies. Here, we are using IndexFlatIP to perform exact nearest-neighbor search using the Inner Product (IP) as the similarity metric. "Flat" means the index stores all vectors directly without compression or hierarchical structures, so the search is exhaustive but still highly optimized in C++. Inner Product search is commonly used when embeddings are L2-normalized, because maximizing the inner product becomes equivalent to maximizing cosine similarity (see the short note after this list). For this reason, IndexFlatIP is popular in RAG systems and semantic search setups where normalized vectors make similarity comparisons more meaningful. Its advantages include simplicity, high accuracy (since it performs exact search), and strong performance for small-to-medium-sized embedding collections. The trade-off is that it does not scale as efficiently as FAISS's approximate indexes when working with very large datasets (e.g., hundreds of millions of vectors).

  • Store metadata. FAISS only supports storing and indexing embedding vectors but not any accompanying metadata. We therefore keep all metadata in a separate Python dictionary using the chunk ids as the keys for quick access.
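A short note on why the L2-normalization in the first step matters: for two L2-normalized vectors $\mathbf{a}$ and $\mathbf{b}$ (i.e., $\|\mathbf{a}\| = \|\mathbf{b}\| = 1$), the inner product is $\langle \mathbf{a}, \mathbf{b} \rangle = \|\mathbf{a}\|\,\|\mathbf{b}\|\cos\theta = \cos\theta$, where $\theta$ is the angle between the two vectors. Ranking chunks by inner product in IndexFlatIP is therefore identical to ranking them by cosine similarity.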

In [14]:
# Create embeddings
chunk_texts = [c["text"] for c in chunks]
chunk_embeddings = embed_model.encode(chunk_texts, convert_to_numpy=True, normalize_embeddings=True)

# Build FAISS index
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(chunk_embeddings)

# Maintain side metadata mapping
metadata = {i: chunks[i]["metadata"] for i in range(len(chunks))}

Note that, by default, FAISS stores all indexes in memory, which allows for extremely fast similarity search and low-latency retrieval. This makes it ideal for real-time applications or workflows where the index fits comfortably in RAM. However, FAISS also provides methods to serialize and save an index to disk using functions like faiss.write_index() and faiss.read_index(). This enables persistent storage, sharing across systems, or reloading large indexes without recomputing embeddings, making it practical for production environments or workflows that require long-term retention of the vector database. For our small RAG system, however, we simply keep everything in memory.
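As a minimal sketch (the file names are just placeholders), persisting and reloading the index together with the metadata could look like this:

In [ ]:
import json

# Persist the FAISS index and the metadata mapping to disk
faiss.write_index(index, "news_chunks.faiss")
with open("news_chunks_metadata.json", "w") as f:
    json.dump(metadata, f, default=str)

# Later: reload both without recomputing any embeddings
index = faiss.read_index("news_chunks.faiss")
with open("news_chunks_metadata.json") as f:
    metadata = {int(k): v for k, v in json.load(f).items()}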


RAG-Based Prompting¶

With the knowledge base created — which in practice is typically the most challenging and time-consuming part to get right — we can now implement our RAG pipeline containing the three main steps of retrieval, augmentation, and generation.

Retrieval¶

The FAISS index allows us to find the most relevant chunks given a user query. To show an example, in the code below we first take the example query from the beginning and use the embedding model to convert it into its corresponding embedding vector. We can then use the FAISS index to find the most relevant chunks by finding the chunk embedding vectors most similar to the query embedding vector. In the example below, we return the top-2 most relevant chunks; we print the chunks as well as their metadata.

In [15]:
query_vector = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
distances, indices = index.search(query_vector, 2)
print(indices)

print("Top matches:\n")
for rank, idx in enumerate(indices[0]):
    print(f"======== Chunk {rank+1} ========")
    print(f"{chunks[idx]['text']} (score: {distances[0][rank]:.4f})\n")
    print(f"   Metadata: {metadata[idx]}")
    print()
[[60 61]]
Top matches:

======== Chunk 1 ========
With winds whirling at about 290 kilometers per hour, Hurricane Melissa is one of the strongest ever recorded in the Atlantic Ocean — and is poised to become the strongest storm ever to make landfall in Jamaica. It's also a huge storm, with hurricane-force winds extending over 70 kilometers from its core. Hours before official landfall, heavy rains and battering winds had already begun to lash the island.

After the Category 5 storm roars ashore over Jamaica on October 28, its path will take it spinning over Cuba, Haiti and the Dominican Republic. Those in its path are bracing for catastrophic flash flooding and landslides, storm surge and waves, and intense winds powerful enough to destroy homes and infrastructure. (score: 0.6465)

   Metadata: {'published': '2025-10-28 10:02:47-04:00', 'title': 'Hurricane Melissa spins into a monster storm as it bears down on Jamaica', 'source': 'https://www.sciencenews.org/article/hurricane-melissa-monster-storm-jamaica'}

======== Chunk 2 ========
The story of this latest hurricane sounds all too familiar: The slow-moving storm was initially unfocused and disorganized, but two days of lingering over deep, warm ocean water gave it enough fuel to whip itself into a tightly spinning catastrophic force of nature, centered around a piercingly sharp eye.

Such rapid intensification of tropical storms into major hurricanes has become the norm as ocean temperatures continue to rise around the globe. Climate change models have projected that hurricanes will also move more slowly as the planet warms — not only giving the storms time to gather more energy from the hot water, but also to dump copious amounts of rain after landfall. Forecasters are projecting that Melissa is holding so much moisture that it could dump as much as a meter of rain on Jamaica. (score: 0.6238)

   Metadata: {'published': '2025-10-28 10:02:47-04:00', 'title': 'Hurricane Melissa spins into a monster storm as it bears down on Jamaica', 'source': 'https://www.sciencenews.org/article/hurricane-melissa-monster-storm-jamaica'}

For more convenient use, we can combine all steps into a method get_context() (see below), which returns the context as the concatenation of all relevant chunks as well as a set containing all the sources (i.e., the urls of the news articles the chunks came from). Since multiple chunks may be taken from the same article, we return a set of sources to remove duplicate urls.

In [16]:
def get_context(query, model, index, chunks, metadata, topk=2):
    # Embed query string to vector    
    query_vector = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    # Find the indices of the most similar chunk embedding vectors
    distances, indices = index.search(query_vector, topk)
    # Create the context as the combination of all relevant chunks
    context = "\n\n".join([ chunks[idx]["text"] for idx in indices[0] ])
    # Extract the corresponding sources for each chunk (remove duplicates, if needed)
    sources = set([ metadata[idx]["source"] for idx in indices[0] ])
    # Return context and sources
    return context, sources

Let's run the method for the example query and inspect the result.

In [17]:
context, sources = get_context(query, embed_model, index, chunks, metadata)

print(f"Context and sources for query '{query}'\n")
print(context)
print(sources)
Context and sources for query 'What were the wind speeds of hurricane Melissa?'

With winds whirling at about 290 kilometers per hour, Hurricane Melissa is one of the strongest ever recorded in the Atlantic Ocean — and is poised to become the strongest storm ever to make landfall in Jamaica. It's also a huge storm, with hurricane-force winds extending over 70 kilometers from its core. Hours before official landfall, heavy rains and battering winds had already begun to lash the island.

After the Category 5 storm roars ashore over Jamaica on October 28, its path will take it spinning over Cuba, Haiti and the Dominican Republic. Those in its path are bracing for catastrophic flash flooding and landslides, storm surge and waves, and intense winds powerful enough to destroy homes and infrastructure.

The story of this latest hurricane sounds all too familiar: The slow-moving storm was initially unfocused and disorganized, but two days of lingering over deep, warm ocean water gave it enough fuel to whip itself into a tightly spinning catastrophic force of nature, centered around a piercingly sharp eye.

Such rapid intensification of tropical storms into major hurricanes has become the norm as ocean temperatures continue to rise around the globe. Climate change models have projected that hurricanes will also move more slowly as the planet warms — not only giving the storms time to gather more energy from the hot water, but also to dump copious amounts of rain after landfall. Forecasters are projecting that Melissa is holding so much moisture that it could dump as much as a meter of rain on Jamaica.
{'https://www.sciencenews.org/article/hurricane-melissa-monster-storm-jamaica'}

Augmentation¶

The augmentation step in a RAG pipeline refers to the process of enriching the model's input with retrieved context from the knowledge repository. The retrieved chunks are combined with the original query, often by concatenating them into a single prompt or input sequence, so that the language model can use them as context when generating a response. Since TinyLlama-1.1B-Chat-v1.0 uses the common role-based message format, we can define a template that appropriately combines the context and the user query, supplemented with a meaningful system message.

In [18]:
messages = [
    {"role": "system", "content": "You are provided with the following context:"},
    {"role": "user", "content": context},
    {"role": "system", "content": "You are a helpful assisstant. Use the provided context to generate a short response. Explain how you derived the answer from the provided content"},
    {"role": "user", "content": query},
]

Keep in mind that there is not a single best way to phrase the system message, to order the context and prompt, etc. The proposed template merely follows best practices such as telling the LLM to focus on the provided context.
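For example, an equally valid alternative is to fold the instructions and the retrieved context into a single system message and keep the user turn as just the question; this is merely one possible variation, not necessarily a better one:

In [ ]:
# Alternative template: one system message carrying both instructions and context
alt_messages = [
    {"role": "system", "content": (
        "You are a helpful assistant. Answer the question using only the context below. "
        "If the context does not contain the answer, say so.\n\n"
        f"Context:\n{context}"
    )},
    {"role": "user", "content": query},
]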

Generation¶

The last step is to simply pass the final prompt to the LLM to generate the response. As we have already defined an auxiliary method generate_output() we can just call it with the prompt containing the context and user query and inspect the returned response.

In [19]:
answer = generate_output(llm, tokenizer, messages)

print(answer)
The provided context states that Hurricane Melissa is one of the strongest ever recorded in the Atlantic Ocean, with winds whirling at about 290 kilometers per hour. The wind speeds of hurricane-force winds extending over 70 kilometers from its core are also mentioned. Therefore, the wind speeds of hurricane Melissa are estimated to be in the range of 290-300 kilometers per hour.

For either of the two example queries defined at the beginning of the notebook, you should now see that the response is factually correct. You can check the correctness by comparing the response with the most relevant chunks used as context, as these chunks stem directly from the published news articles (which, of course, we assume to be correct).

Putting it all Together¶

With all three steps implemented, we can combine retrieval, augmentation, and generation into a single method to query our simple RAG system more easily; see the method generate_rag_output() below. Note that the method uses all the default parameters of the methods get_context() (the number of retrieved chunks) and generate_output() (the maximum length of the response and the temperature) to keep the code clean and simple. However, extending generate_rag_output() to accept these parameters as input is straightforward.

In [20]:
def generate_rag_output(query, embed_model, index, chunks, metadata, llm, tokenizer):
    # (1) Retrieval
    context, sources = get_context(query, embed_model, index, chunks, metadata)
    # (2) Augmentation
    messages = [
        {"role": "system", "content": "You are provided with the following context:"},
        {"role": "user", "content": context},
        {"role": "system", "content": "You are a helpful assisstant. Use the provided context to generate a short response. Explain how you derived the answer from the provided content"},
        {"role": "user", "content": query},
    ]
    # (3) Generation
    answer = generate_output(llm, tokenizer, messages)
    return answer, sources

We can now use this single function to submit user queries to our RAG pipeline.

In [21]:
query = "What were the wind speeds of hurricane Melissa?"
#query = "How many shark attacks have been in 2024 that ended deadly?"

answer, sources = generate_rag_output(query, embed_model, index, chunks, metadata, llm, tokenizer)

print(f"Query: '{query}'\n")
print(f"Answer:\n{answer}")
print(f"Sources: {sources}")
Query: 'What were the wind speeds of hurricane Melissa?'

Answer:
The provided context states that Hurricane Melissa is one of the strongest ever recorded in the Atlantic Ocean, with winds whirling at about 290 kilometers per hour. The wind speeds of hurricane-force winds extending over 70 kilometers from its core are also mentioned. Therefore, the wind speeds of hurricane Melissa are estimated to be in the range of 290-300 kilometers per hour.
Sources: {'https://www.sciencenews.org/article/hurricane-melissa-monster-storm-jamaica'}

Important: Although our basic implementation of a RAG pipeline seems to work well, it is important to highlight a critical limitation of this solution. Recall that the get_context() method returns, by default, the $2$ most relevant chunks for a given user query. However, since we did not specify any minimum similarity, we will always get $2$ chunks, even for queries that do not match any meaningful chunk. To show this, we can submit queries that the pretrained LLM can answer without any context from the knowledge base; see the code cell below.

In [22]:
query = "What is the capital of France?"
#query = "What are the main ingredients of bread?"

answer, sources = generate_rag_output(query, embed_model, index, chunks, metadata, llm, tokenizer)

print(f"Query: '{query}'\n")
print(f"Answer:\n{answer}")
print(f"Sources: {sources}")
Query: 'What is the capital of France?'

Answer:
The capital of France is Paris.
Sources: {'https://www.sciencenews.org/article/nasas-webb-telescope-moon-uranus', 'https://www.sciencenews.org/article/coral-collapse-climate-tipping-point'}

Notice that the LLM may still provide the correct answer by "ignoring" the provided context. However, this might not always be the case for all kinds of queries. As indicated above, one way to address this is to define a minimum similarity between the query vector and the chunk vectors, and dismiss any chunk below this threshold. Since the search() method of FAISS returns both the indices of the most relevant chunks and their distances/similarities, filtering out chunks with insufficient similarity is straightforward.
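A minimal sketch of such a filter is shown below; the threshold value of 0.4 is purely a placeholder and would need to be tuned empirically:

In [ ]:
def get_context_filtered(query, model, index, chunks, metadata, topk=2, min_score=0.4):
    # Embed the query and retrieve the top-k candidates together with their similarity scores
    query_vector = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = index.search(query_vector, topk)
    # Keep only candidates whose cosine similarity reaches the (placeholder) threshold
    keep = [idx for score, idx in zip(scores[0], indices[0]) if score >= min_score]
    context = "\n\n".join(chunks[idx]["text"] for idx in keep)
    sources = set(metadata[idx]["source"] for idx in keep)
    return context, sources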

However, finding a meaningful similarity threshold in practice can be tricky because embedding scores are not inherently standardized across queries or datasets. The numerical range of cosine similarity or inner product depends on factors such as the embedding model, vector normalization, and the dimensionality of embeddings. A score that indicates a strong semantic match in one context might be weak in another, making it difficult to choose a single threshold that reliably separates relevant from irrelevant chunks across all queries.

Additionally, different queries have different "semantic density". Some queries are very specific, making even moderate similarity scores meaningful, while others are broad, where only very high similarity scores indicate true relevance. This variability means that a fixed threshold can either exclude useful information or include noisy, irrelevant chunks. As a result, practitioners often experiment with thresholds empirically, adjust them dynamically based on query characteristics, or rely on ranking and top-k strategies combined with thresholds to balance recall and precision. These considerations, although very important in practice, go beyond the scope of this introductory notebook.


Discussion¶

The RAG pipeline we implemented and showcased in this notebook, using a small example dataset, represents the most basic RAG implementation, since the focus here was on simplicity and clarity. Building effective and efficient RAG pipelines for real-world applications is much more challenging. While the three main steps of retrieval, augmentation, and generation still form the backbone, their implementation requires more careful consideration. In the following, we briefly outline some of the potential added complexity.

  • Advanced chunking strategies. Our dataset of news articles was very well-behaved in a sense that documents were simply a list of (typically short) paragraphs. In real-world RAG systems, documents often have complex structures that go beyond simple continuous text. Articles, reports, or technical manuals may include tables, bullet points, code snippets, or even conversational transcripts. If these documents are naively split, chunks may cut across logical boundaries, separating related information or combining unrelated pieces. To handle such complexity, more advanced chunking strategies are often required. Semantic or structure-aware chunking can detect natural boundaries such as table rows, bullet points, or dialogue turns, ensuring each chunk represents a coherent concept. Recursive or hierarchical splitting approaches can preserve context in long documents while maintaining manageable chunk sizes for embedding.

  • Multimodal content. Instead of just text content — like used here in this notebook — documents for RAG systems may be multimodal, containing not only text but also images, charts, diagrams, or other visual elements. These non-textual components often carry critical information that cannot be captured through text embeddings alone. Ignoring these elements can lead to incomplete or misleading context being provided to the language model, reducing the quality and accuracy of generated responses. Incorporating multimodal content affects the implementation of a RAG pipeline in several ways. First, it requires additional preprocessing steps, such as extracting image features using computer vision models or converting charts into structured data representations. Second, embeddings must be generated for both textual and visual content, and retrieval mechanisms must be able to handle these heterogeneous embeddings in a coherent way. Finally, the augmentation step must integrate multimodal information effectively, ensuring that the language model can reason over text and visual evidence simultaneously. This adds complexity to the pipeline but is essential for building RAG systems that can leverage the full richness of modern documents.

  • Vector search vs. full-text search. In this notebook, we considered vector search to support the semantic retrieval of chunks. While vector search is powerful for capturing semantic similarity, it can sometimes introduce challenges in RAG systems. One issue is that embeddings may retrieve chunks that are semantically related but factually incorrect or only loosely relevant to the query. For example, two concepts might be contextually similar in embedding space but differ in critical details, leading the language model to hallucinate or provide misleading answers. Additionally, vector search performance and relevance depend heavily on the quality of the embedding model and the choice of similarity metric, which may not generalize well across all queries or domains. In cases where exact keyword matches or precise terminology are important, a full-text search or a hybrid approach may be preferable. Full-text search excels at retrieving documents containing specific terms, numbers, or named entities, which is crucial in technical, legal, or scientific texts. Hybrid approaches combine the strengths of both vector and keyword-based search, first narrowing candidates using keywords and then ranking them semantically. This ensures both precision and semantic relevance, reducing the risk of retrieving irrelevant or misleading chunks while still benefiting from the flexibility of vector-based similarity. A minimal sketch of such a hybrid scorer follows this list.

  • Scaling & efficiency. Since we used only a small pretrained model and a very small dataset, performance was not a concern for our basic RAG pipeline. Building an efficient RAG pipeline at large scale, with millions of documents, introduces several significant challenges. Firstly, storage and retrieval efficiency becomes critical, as embedding millions of chunks can require hundreds of gigabytes of memory or disk space. Simple in-memory indexes may no longer be feasible, and even vector search algorithms can become computationally expensive if not optimized. Efficient indexing structures, approximate nearest-neighbor search, and distributed storage solutions are often necessary to keep latency low while maintaining acceptable retrieval quality. And secondly, updating and maintaining the knowledge repository becomes more complex at scale. Adding new documents, reindexing embeddings, and ensuring consistency across distributed systems can be time-consuming and error-prone. Additionally, managing retrieval quality across diverse document types and domains is harder, since embeddings and chunking strategies may vary in effectiveness. Large-scale systems also need to balance precision, recall, and latency, often requiring hybrid search, caching, and prioritization strategies to ensure the RAG pipeline remains practical and responsive in real-world applications.


Summary¶

This Jupyter notebook provided a practical introduction to building a Retrieval-Augmented Generation (RAG) system using a locally running pretrained language model and a small dataset of news articles. The goal was to illustrate the fundamental components of a RAG pipeline, including document chunking, embedding generation, vector storage and retrieval using FAISS, and augmentation of the query with retrieved information for grounded text generation. By walking through each step with simplified, "by-hand" implementations, the notebook made the inner workings of RAG systems transparent and accessible for educational purposes.

The notebook demonstrated how to break documents into chunks to improve retrieval relevance, encode each chunk into vector embeddings using the all-MiniLM-L6-v2 model, and store these embeddings in a FAISS index. It showed how retrieval works by finding the most semantically similar chunks for a given query and how these retrieved chunks are then augmented with the user query to provide context for the language model. This simple example highlights the interplay between retrieval and generation, which is the key insight behind RAG: the model no longer relies solely on its parametric memory but can access external, up-to-date information.

While this notebook focused on clarity and simplicity, it is important to note that real-world RAG systems involve many additional considerations. These include handling very large datasets, ensuring efficient and scalable indexing, dealing with memory and latency constraints, dynamically selecting relevant chunks, and managing information freshness. Other practical concerns, such as multi-modal data, hybrid retrieval strategies combining vector and keyword search, and safety or guardrails to reduce hallucinations, were deliberately left out to keep the example understandable.

Overall, the notebook served as a hands-on educational tool to explore the mechanics of RAG without the complexity of production-scale implementations. By implementing each step manually, you can develop a stronger intuition about how retrieval, augmentation, and generation interact and why each component matters. This foundational understanding is essential for anyone aiming to build more advanced or production-ready RAG systems in the future.

In [ ]: