Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Using Pretrained LLMs Locally — A Starter Guide¶

Pretrained large language models (LLMs) are no longer confined to cloud-based APIs or powerful research clusters. With advances in model compression, quantization, and optimized runtimes, it has become increasingly practical to run these models directly on personal computers or edge devices. This shift reflects a broader trend of democratizing AI, making it accessible not just to enterprises with vast resources, but also to individual developers, researchers, and hobbyists. As a result, the landscape of LLM deployment is expanding, offering new opportunities for experimentation, customization, and innovation outside the traditional cloud environment.

Running LLMs locally offers several clear advantages. One of the biggest benefits is data privacy: sensitive inputs remain on the user’s machine, avoiding potential exposure to third-party servers. Local inference also reduces latency, since responses do not rely on round trips to external APIs, which is especially important for real-time applications. Moreover, running models locally can be more cost-effective, as it avoids recurring API usage fees and allows users to leverage existing hardware. Finally, local deployment fosters greater control and customization, enabling fine-tuning, domain adaptation, or integration into specialized workflows without external constraints.

However, there are also significant challenges. Pretrained LLMs are resource-intensive, and even with optimizations, they often require powerful GPUs or large amounts of memory to run efficiently. This can limit accessibility for users with modest hardware. Furthermore, keeping models updated, managing dependencies, and troubleshooting performance bottlenecks can introduce additional complexity compared to simply using a cloud API. In some cases, cloud-based solutions may still be preferable for scaling, reliability, or accessing cutting-edge models without the burden of local setup.

Learning about local LLM deployment is important because it equips practitioners with the ability to make informed choices between cloud and on-device solutions depending on their goals. For organizations, understanding this approach can open paths toward greater data security and cost efficiency. For individuals, it provides a hands-on way to explore, customize, and innovate with AI beyond what is possible through closed APIs. As tools and frameworks continue to evolve, being familiar with local LLM deployment ensures that developers and researchers can fully leverage the growing ecosystem, balancing its strengths and limitations to meet their unique needs.

This notebook provides a guide on how to get started with running pretrained models on your local machine, whether for experimenting with LLMs or for building powerful LLM-driven applications without relying on cloud-based APIs. We first discuss in more detail the pros and cons of using APIs versus running LLMs locally, and then cover popular runtimes and frameworks to load, run, and use pretrained models. Let's get started!

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages (such as the Hugging Face transformers library and the ollama client library) as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
# Some required base Libraries
import requests, json 

# PyTorch library (used as backend for transformers library)
import torch

# transformers library from Hugging Face to load, run, and use pretrained models
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ollama wrapper library for Ollama
import ollama

# Auxiliary LangChain libraries to integrate Ollama and Hugging Face Transformers
from langchain_huggingface import HuggingFacePipeline
from langchain_ollama import OllamaLLM

from src.utils.compute.gpu import *

Checking & Setting Computing Device¶

PyTorch allows you to run neural networks on a supported GPU to significantly speed up computation. If you have a supported GPU, feel free to utilize it. However, for this notebook it is not strictly needed, since we only work with very small models. We provide an auxiliary method to automatically select the best device. It checks if a supported GPU is available and, if so, uses it as the preferred device.

In [2]:
# Select preferred device (GPU, if available; CPU otherwise); you can enforce the use of the CPU
device = select_device(force_cpu=False)

print("Available device: {}".format(device))
Available device: cuda:0

Preliminaries¶

Before checking out this notebook, please consider the following:

  • In this notebook, we only look at using locally running LLMs for basic inference — submitting simple prompts and reading the generated responses. More advanced techniques such as Retrieval-Augmented Generation (RAG), quantization, fine-tuning, and so on are each a topic of their own and beyond the scope here.

  • Our focus is on running LLMs locally to access the models programmatically within an application or script. While desktop applications for interactively conversing with local LLMs are also very popular, we only briefly mention those at the end.


Motivation: Cloud-Based APIs vs Local Inference¶

When working with large language models (LLMs), there are two primary approaches: using cloud-based APIs or running models locally. Cloud APIs provide convenient, scalable access to powerful models without the need for specialized hardware or setup, while local deployments give users direct control over the model and its environment. Both approaches are widely adopted in practice, but each comes with trade-offs that influence how they can be used effectively. The choice between cloud-based and local LLMs depends on factors such as performance requirements, data sensitivity, cost considerations, and the level of control needed for specific applications.

Limitations of Cloud-Based APIs¶

Using cloud-based APIs to access large language models (LLMs) offers several significant advantages, particularly in terms of ease of use. With a cloud API, you do not need to worry about downloading, configuring, or maintaining large models locally. The provider hosts the model, handles hardware requirements, and ensures it is optimized for performance. This allows you to focus purely on application logic and prompt design rather than infrastructure, making it ideal for rapid prototyping, experimentation, or deploying production applications without large upfront hardware costs.

Another major advantage is scalability and availability. Cloud APIs can handle high volumes of requests and support concurrent users, often with minimal latency and automated load balancing. You can access state-of-the-art models that may be too large or computationally expensive to run locally, including models with billions of parameters or specialized fine-tuned versions. This allows businesses and developers to leverage the latest advances in AI without worrying about GPU memory, parallelization, or model updates. They also relieve users from the burden of maintenance, versioning, and security updates, as the provider handles these aspects.

However, there are also several challenges and downsides that come with using cloud-based APIs to access LLMs.

Data Privacy¶

When working with cloud-based APIs, data privacy is a major concern because any input sent to the API is typically processed on servers owned by the provider. This means sensitive or confidential information — such as personal data, company secrets, or proprietary research — leaves your local environment and is transmitted over the internet. Even with encryption in transit, there is a risk that this data could be logged, cached, or analyzed by the service provider for model improvement or troubleshooting, depending on the provider's data retention policies.

For example, if you are a company that uses a cloud LLM to summarize confidential client reports, those reports must be sent to the cloud servers. If the provider logs prompts for fine-tuning purposes, there is a chance that sensitive client information could be stored outside your control. Similarly, if you are a healthcare professional using a cloud LLM to analyze patient records, you would need to ensure compliance with regulations like HIPAA; sending patient data to an external API without proper safeguards could lead to legal and ethical issues.

Another concern is accidental data leakage through the model's responses. Cloud LLMs are trained on massive datasets and sometimes may regenerate or expose patterns from previously seen data. If sensitive or proprietary information is sent to the model, there is a small risk that it could inadvertently appear in output generated for another user or application, depending on how the API handles model state or caching. In practice, you need to carefully review privacy policies, data retention policies, and access controls to ensure compliance.

Cost Control¶

Usage of cloud-based APIs to access LLMs is typically billed per request, per token, or per unit of compute time. The costs can escalate quickly, especially for applications that process large volumes of data, handle many concurrent users, or generate long outputs. This introduces ongoing operational costs that must be monitored and managed to avoid unexpected bills. For example, consider your business integrating a cloud LLM to summarize customer support tickets in real time. If each ticket is 1,000 tokens and there are 10,000 tickets per month, the cumulative token usage could result in a significant monthly cost. Without proper monitoring or usage limits, this could easily exceed the budget allocated for AI services.

Another cost concern comes from inefficient prompts or excessive token generation. Using overly long prompts or setting high max_tokens for generation increases the number of tokens processed per request. For instance, asking a model to generate a 5,000-token essay for every prompt when only a 500-token summary is needed can multiply costs unnecessarily. Additionally, some applications might inadvertently make repeated or redundant API calls, further inflating usage costs. Finally, managing scaling costs is also important. Cloud APIs are convenient for handling spikes in traffic, but sudden surges can dramatically increase monthly expenses. For example, a chatbot embedded in your public website could see unexpected high traffic during a marketing campaign, resulting in heavy API usage. Organizations often need to implement rate limits, caching strategies, or batch processing to control costs while still maintaining performance. Monitoring dashboards and budget alerts provided by API providers can help mitigate these risks, ensuring predictable and manageable expenses.
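To make the arithmetic above concrete, here is a minimal back-of-the-envelope sketch; the per-token price is a hypothetical placeholder and should be replaced with your provider's actual pricing:

# Rough monthly cost estimate for the support-ticket example above
price_per_1k_tokens = 0.002      # hypothetical USD per 1,000 tokens; check your provider's price sheet
tokens_per_ticket = 1_000
tickets_per_month = 10_000

monthly_tokens = tokens_per_ticket * tickets_per_month
monthly_cost = monthly_tokens / 1_000 * price_per_1k_tokens
print(f"~{monthly_tokens:,} tokens/month -> ~${monthly_cost:,.2f}/month")

Even such a crude estimate makes it easy to see how prompt length, output length, and request volume each scale the monthly bill.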

Flexibility & Customization¶

Cloud-based APIs offer convenience and scalability, but they come with limitations in customization and flexibility. Since the models are hosted and maintained by the provider, users often have limited control over the underlying architecture, training data, or fine-tuning capabilities. You can usually adjust various basic parameters, but deeper modifications (e.g., adding domain-specific knowledge, changing model behavior significantly, or integrating new task-specific components) are typically restricted. This can be a significant limitation for organizations or developers who need highly specialized AI behavior. For example, a legal tech company may want an LLM to provide responses strictly aligned with local regulations. Using a cloud API, it can prompt the model carefully, but it cannot retrain the model on proprietary legal datasets directly. Similarly, a biotech startup may want an LLM to answer technical questions about novel proteins. Without the ability to fine-tune the model on proprietary research papers, the API may produce generic or less accurate answers.

Another limitation is workflow integration and tool use. Cloud-based APIs generally provide a request-response interface, which may not easily support multi-step reasoning, external tool invocation, or custom memory management. For instance, building an autonomous agent that queries internal databases, executes code, and stores context for later decisions is harder with a cloud API alone. While some providers offer enhanced features (e.g., plugin systems or fine-tuning services), the flexibility is still constrained compared to frameworks like LangChain or Ollama that allow programmatic orchestration of multiple LLMs and external tools.

Finally, response variability and control can be a challenge. Cloud APIs may update models in the background or serve different versions depending on load and availability, which can affect reproducibility. If you require precise, predictable outputs for a specialized application, relying on a cloud model that you cannot fully control may introduce inconsistencies. Running a model locally or using a customizable framework ensures the same model version, tokenization, and configuration every time, giving developers more control over outputs and behavior.

Local Inference¶

The alternative to using cloud-based APIs is running LLMs locally. This offers significant benefits, particularly when data privacy is a priority. By keeping all inputs and outputs on a local machine or internal network, sensitive information never leaves your environment, reducing the risk of data leaks or exposure to third-party servers. Also, unlike cloud-based APIs, which charge per token, request, or compute time, local deployments incur primarily upfront hardware and electricity costs, making them more predictable over time. For applications with high-volume or frequent usage, this can result in substantial cost savings compared to recurring cloud fees. Lastly, running LLMs locally enables greater flexibility and customization. You can fine-tune models, apply quantization, experiment with different architectures, or integrate models into complex workflows without restrictions imposed by cloud providers. This makes it possible to tailor model behavior to specific domains, optimize performance for available hardware, or embed models into custom applications and agents. Local deployment empowers you to leverage the full potential of LLMs, combining control, adaptability, and efficiency in ways that cloud-only solutions often cannot match.

On the flip side, running LLMs locally comes with several downsides and challenges that need to be considered:

  • Hardware requirements: Many LLMs are extremely large and require significant computational resources. Running them efficiently often needs high-end GPUs, substantial RAM, or fast storage. On lower-end machines, models may run slowly, be limited in size, or be impossible to load entirely.

  • Setup and maintenance complexity: Installing, configuring, and managing local LLMs can be technically challenging. Users may need to handle dependencies, runtime environments, quantization for memory optimization, and model updates. Troubleshooting performance or compatibility issues can require advanced technical knowledge.

  • Scalability limitations: Local deployments are constrained by the hardware they run on. Serving multiple users, handling high request volumes, or running models in production can quickly overwhelm a single machine. Unlike cloud APIs, local setups often lack built-in mechanisms for load balancing or distributed inference.

  • Performance trade-offs: Without specialized hardware, local models may run slower and have higher latency than cloud-hosted models optimized for large-scale inference. Some advanced features available in cloud environments—like streaming outputs, dynamic batching, or very large state-of-the-art models—may not be feasible locally.

  • Ongoing maintenance and versioning: Keeping models up to date, managing multiple versions, or integrating new models can be labor-intensive. Unlike cloud providers, which handle updates and optimizations automatically, local deployments require manual intervention to maintain performance and security.

In short, there is no single best answer when it comes to choosing between cloud-based APIs and running LLMs locally. The choice highly depends on your exact use case, requirements, and constraints (e.g., limited budget). That being said, one of the most practical challenges is the set of technical considerations involved. While running LLMs locally is technically feasible, the requirements vary greatly depending on model size, hardware (CPU vs GPU), and the format in which the model is stored (e.g., full precision vs quantized). To give some numbers, the table below shows rough estimates of RAM and VRAM usage based on model size and precision.

| Model | CPU RAM (int8, quantized) | GPU VRAM (fp16) | Notes |
|-------|---------------------------|-----------------|-------|
| 3B (e.g., TinyLlama) | 4–6 GB | 2–3 GB | Works on most laptops |
| 7B (e.g., Mistral) | 12–16 GB | 8–10 GB | Requires decent GPU or CPU + swap |
| 13B (e.g., LLaMA-2) | 24–30 GB | 16–20 GB | Needs high-end consumer GPU or 64 GB RAM |
| 65B (e.g., LLaMA) | 64+ GB | 48+ GB | Requires professional workstation/server |

While smaller pretrained models may perform very well for a given task, they generally do not offer the same capabilities as the very large LLMs that nowadays reach up to a trillion parameters or more and run on huge GPU clusters. Overall, however, running pretrained models locally on your own hardware typically requires some additional considerations and steps compared to easy-to-use cloud-based APIs. In the following, we therefore provide a basic guide to get started with using LLMs for local inference.
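As a rough rule of thumb underlying the table above, the memory needed to hold the weights is simply the parameter count times the bytes per parameter. The small sketch below computes this weights-only lower bound; actual usage is higher due to activations, the KV cache, and runtime overhead, so the numbers will not exactly match the rough estimates in the table:

# Weights-only memory estimate: parameters x bytes per parameter (lower bound, no overhead)
def weights_memory_gib(num_params_billions, bytes_per_param):
    return num_params_billions * 1e9 * bytes_per_param / 1024**3

for params in (1.1, 7.0, 13.0):
    print(f"{params:>4}B parameters: "
          f"fp16 ~ {weights_memory_gib(params, 2):.1f} GiB, "
          f"int8 ~ {weights_memory_gib(params, 1):.1f} GiB, "
          f"int4 ~ {weights_memory_gib(params, 0.5):.1f} GiB")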


Popular Tools & Frameworks¶

The growing popularity and capabilities of LLMs have led to an increased demand for tools and frameworks that make it easier to run these models locally. Beyond simple inference, developers are increasingly looking to integrate LLMs into applications, enabling them to interact with other systems, manage context, and perform complex tasks. This has spurred the development of platforms like Ollama, Hugging Face Transformers, and LangChain, which provide the infrastructure and APIs to embed LLMs seamlessly into software workflows. These frameworks not only allow local execution of models but also simplify the programmatic access and orchestration of LLMs within applications. Let's go through some practical examples using these tools.

Ollama¶

Ollama is a powerful, open-source tool designed to make it easy for users to run large language models (LLMs) on their own local machines. It acts as a bridge between your computer and a wide variety of open-source LLMs, simplifying the complex process of downloading, configuring, and running these models. By using a straightforward command-line interface, Ollama allows users to download and manage different models, much like a package manager. This gives developers, researchers, and AI enthusiasts a user-friendly and accessible way to experiment with and deploy AI, all without needing to rely on third-party cloud services.

Ollama is built on top of llama.cpp, but they serve different purposes and offer distinct user experiences. Think of llama.cpp as the core engine for running large language models (LLMs), a highly efficient C++ library that handles the heavy lifting of model inference. It is powerful and highly customizable, giving advanced users granular control over settings like quantization, GPU offloading, and memory management. However, it is a command-line-based tool that requires some technical knowledge to set up and use, making it less accessible for beginners. Ollama is a user-friendly wrapper around the llama.cpp engine. It is designed to simplify the entire process of running LLMs locally, making it a "plug-and-play" solution. This abstraction makes it far more accessible for developers and everyday users who do not want to deal with the complexities of compiling code or manually managing command-line flags.

Installation¶

To download and install Ollama, just go to the official download page and follow the instructions, which differ slightly depending on your operating system (Windows, macOS, or Linux). To confirm that the installation was successful, open a command line terminal and run the following command to show the installed version:

ollama --version

If this command runs successfully, Ollama is ready to go.

Loading & Serving Models¶

Once Ollama is installed, we first need to load at least one pretrained model and expose it via a local API. To see which models are available, you can go to the official Ollama Library and browse and search for models that may serve your purpose best. Once you have identified a model you want to use, you can download it with the ollama pull command; for example, the command below downloads the tinyllama model — we go with a very small model to avoid any memory concerns since this notebook is just about providing a simple guide to get going.

ollama pull tinyllama

You can naturally download multiple models; note that just downloading a model does not actually run it. To see which models you have downloaded and are therefore available, you can list them all using the command below.

ollama list

For example, the tinyllama model we have downloaded should appear with the name tinyllama:latest in this list. This is the name you typically specify to refer to the model you want to use. If you want to use the model interactively using a one-off session from your terminal — ideal for testing, experimentation, or small-scale usage where you do not need programmatic access — run the following command:

ollama run tinyllama:latest

This loads the model into memory, and you can now directly write your prompts into this terminal and see the responses immediately. Once you exit the session, the model stops running. However, if you want to use the model programmatically within your application, you can start a local server that keeps the model running in the background and exposes it via an API endpoint. For this, you just need to run the following command in a command line terminal.

ollama serve

Now other programs, scripts, or services can send HTTP requests to the server and receive responses from the model. This is ideal for integration into applications, automated workflows, or multi-user scenarios, because the model does not need to be reloaded for each request. The server keeps running until you manually stop it, allowing persistent, programmatic access.

Using Ollama in Python¶

With Ollama "serving" pretrained models and making them accessible via a locally(!) running API, we can submit prompts to the model using HTTP requests against this API. However, a more convenient method is to use the ollama Python library. It essentially acts as a bridge between Python applications and the Ollama service, making it easier to integrate LLM-powered features into scripts, apps, or pipelines. Within this library, the Client class is the main entry point for interacting with Ollama. It wraps Ollama's HTTP API and provides convenient methods to run models, stream responses, and manage models (e.g., pulling or listing them). In short, the Client abstracts away the networking details and exposes Ollama's capabilities in a simple Pythonic interface.
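For comparison, this is roughly what the raw HTTP route looks like, using the requests package imported at the top of the notebook and assuming the server runs at the default address and port; the /api/generate endpoint is part of Ollama's standard REST API:

# Send a single, non-streaming generation request directly to the local Ollama server
payload = {
    "model": "tinyllama:latest",
    "prompt": "How old are the pyramids of Giza?",
    "stream": False,
}
r = requests.post("http://localhost:11434/api/generate", json=payload)
r.raise_for_status()
print(r.json()["response"])

The ollama library wraps exactly this kind of request for you, which is why we use it in the rest of this section.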

The code cell below creates a client object using the Client class. The main argument is the address of the API. By default, the API listens on port 11434 on localhost. In this case, we could also omit the host argument. Of course, if Ollama runs on a different machine or on a different port, we need to explicitly tell the Client class where to find the API.

In [3]:
client = ollama.Client(host="http://localhost:11434")
#client = ollama.Client() # Also works since "http://localhost:11434" is the default

We can now submit prompts to any available model. The generate() method of the library is used to run a model and produce text completions based on a given prompt; see the example in the code cell below. You provide it with the model name (e.g., "tinyllama:latest") and an input prompt, and it returns the model's generated response. This method is typically used for single-turn interactions where you want the model to complete or expand text, rather than maintain a multi-turn conversation. Under the hood, generate() sends a request to the Ollama service, streams back the model's output, and collects the result for you. It's useful for tasks like summarization, code generation, or drafting text when you don't need the conversational context management that chat() provides.

In [4]:
model =  "tinyllama:latest"
prompt = "How old are the pyramids of Giza?"

response = client.generate(model=model, prompt=prompt)

print(response['response'])
The Pyramids of Giza, including the Great Pyramid and the Pyramid of Khufu, were built over two thousand years ago during the Old Kingdom (2570-2184 BCE) of ancient Egypt. They are estimated to be around 4,500 years old, which makes them among the oldest extant manmade structures on Earth. The Great Pyramid of Giza was completed around 2569 BCE and is believed to have been the largest pyramid in the world at its construction.
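By default, generate() collects the full completion before returning. The library also supports streaming, where the response is delivered chunk by chunk as it is generated, which is handy for interactive applications. A minimal sketch, reusing the client and model from above (depending on the library version, chunks support the dict-style access used elsewhere in this notebook):

# Stream the response chunk by chunk instead of waiting for the full completion
for chunk in client.generate(model=model, prompt="Why is the sky blue?", stream=True):
    print(chunk["response"], end="", flush=True)
print()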

In contrast, the chat() method is designed for multi-turn conversations with a model. Unlike generate(), which just takes a single prompt and returns a completion, chat() accepts a structured list of messages (with roles like "system", "user", and "assistant") and maintains the conversational context across turns. This makes it ideal for building chatbots or assistants where the model needs to remember prior exchanges and respond coherently. Internally, chat() organizes the dialogue into a sequence of messages and sends them to the model, which then generates a continuation based on the full conversation history. This enables richer, more interactive behavior compared to generate(), since the model can reason about what was said earlier and adjust its responses accordingly. In short, chat() is Ollama’s higher-level interface for dialogue, while generate() is for one-off completions.

Note: tinyllama itself is not inherently a chat-capable model. It is a smaller, more lightweight version of the LLaMA model primarily designed for inference on resource-constrained devices. By default, it is trained for general-purpose text generation, not for dialogue. The code cell below will still return some meaningful output, but for proper dialogue-style response generation you would need to load and run a chat-capable model (e.g., vicuna, openchat, llama2).

In [5]:
# Ensure you have a chat-capable model pulled and running
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I am doing well, thank you! How can I help you today?"},
    {"role": "user", "content": "How old are the pyramids of Giza?"}
]

reply = ollama.chat(model=model, messages=messages)

print(reply['message']['content'])
The pyramids of Giza are believed to have been built during the reign of King Menkaure (2586-2541 BCE), who ruled over Egypt between 2573 and 2549 BCE. The exact age of the pyramids themselves is not known, but they were constructed over several centuries during King Menkaure's reign. Archaeological research has suggested that they could have been built as far back as the late Predynastic period (c. 3100-3000 BCE).

Besides generate() and chat(), the ollama Python library provides several other useful methods through its Client class, mostly for managing models and interacting with the local Ollama service:

  • pull(): downloads a model from the Ollama model library to your local machine.
  • list(): returns a list of all models currently available locally, along with details like size and modification date.
  • show(): displays information about a specific model, such as metadata, parameters, and system prompts used.
  • ps(): shows models that are currently running.
  • delete(): removes a model from your local system to free up disk space.

Together, these methods give you the ability to manage models (download, inspect, delete) and to run or converse with them (generate() and chat()). This makes the Ollama library not just an inference tool, but also a lightweight model management interface. For a quick example, the code cell below uses the list() method to show all the currently available models; of course, the output will depend on the models you have downloaded.

In [6]:
response = client.list()

for model in response.models:
    print(f"{model.model} (size={model.size/(1024*1024*1024):.3f} GB, #parameters: {model.details.parameter_size})")
gemma3:1b (size=0.759 GB, #parameters: 999.89M)
tinyllama:latest (size=0.594 GB, #parameters: 1B)

Overall, Ollama provides an easy way to run pretrained large language models (LLMs) locally because it abstracts away the complexities of setup, dependencies, and hardware configuration. Instead of dealing with intricate model weights, tokenizers, and serving frameworks, you simply install Ollama, pull a model, and start generating text or chatting with it. This makes experimenting with state-of-the-art open-source models accessible even to those without deep ML infrastructure knowledge, while still giving power users control over running models on their own hardware without relying on cloud services.

The ollama Python library builds on this simplicity by giving developers a clean and Pythonic interface to work with LLMs programmatically. With just a few method calls, you can generate completions (generate()), hold conversations (chat()), or manage models (pull(), list(), delete()). This allows seamless integration of LLMs into applications, automation scripts, or research workflows. In short, Ollama makes local LLM deployment straightforward, and its Python library makes interacting with those models just as simple in code.

Hugging Face¶

Hugging Face is a very popular open-source platform and community for natural language processing (NLP) and machine learning. It is best known for its transformers library, which provides thousands of pretrained models for tasks such as text generation, translation, classification, and more. Researchers and developers can easily share, discover, and fine-tune models on the Hugging Face Hub, making it a central ecosystem for modern AI development.

When it comes to running pretrained LLMs locally, Hugging Face makes the process straightforward through its transformers and datasets libraries. With just a few lines of code, you can download a model (e.g., GPT-2, LLaMA, or BLOOM) and run inference directly on your own hardware, whether CPU or GPU. Hugging Face also provides optimized backends (such as accelerate, optimum, and integrations with frameworks like PyTorch and TensorFlow) that handle device placement, quantization, and performance tuning. This means that you do not need to manually set up tokenizers, manage model weights, or worry about compatibility issues.

In practice, Hugging Face supports local deployment by letting developers choose between out-of-the-box pretrained models for quick experimentation, or fine-tuned/custom models for specific applications. Combined with tools like accelerate for distributed computing and optimum for hardware-specific optimizations (e.g., NVIDIA GPUs or Apple Silicon), Hugging Face makes it both accessible and efficient to run powerful LLMs locally. This flexibility is why it is one of the most widely adopted platforms in the AI developer community.

Loading Model¶

Like for Ollama, we first need to specify which model we want to use. The model library of Hugging Face, often called the Hugging Face Hub, is a vast open-source repository where researchers, developers, and organizations share pretrained models for machine learning. It covers a wide range of tasks such as text generation, translation, summarization, question answering, sentiment analysis, image recognition, audio processing, and more. Each model entry typically includes its weights, configuration files, tokenizer, and documentation, making it easy to load and use directly with the Hugging Face transformers library.

In the following example, we focus on ChatGPT-style text generation using the small TinyLlama/TinyLlama-1.1B-Chat-v1.0 model. This model is a lightweight, open-source LLM developed as part of the TinyLlama project. It has about 1.1 billion parameters, making it much smaller than mainstream LLMs like LLaMA-2 (7B+) or GPT-style models. Despite its reduced size, it is trained to be efficient and effective for conversational use, allowing it to run on more modest hardware such as consumer GPUs or even some high-end CPUs.

The “Chat” designation means this version has been fine-tuned specifically for dialogue and instruction following, rather than just raw text completion. This tuning makes it better at holding back-and-forth conversations, answering questions, and following structured prompts in a way that feels more natural and assistant-like. In essence, TinyLlama-1.1B-Chat-v1.0 is a compact, resource-friendly chat model, well-suited for local deployment and lightweight applications where larger LLMs would be too demanding.

In [7]:
# Specify the model name from Hugging Face Hub
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

What makes the Hugging Face model library powerful is its standardized interface — any model hosted there can usually be loaded with a single line of code. This removes the friction of dealing with raw model files and ensures interoperability across frameworks. The AutoModelForCausalLM class of transformers library is a generic wrapper for loading pretrained causal language models (models trained to predict the next token in a sequence). It provides a unified interface so you do not need to know the exact underlying architecture (e.g., GPT-2, LLaMA, BLOOM). Instead, you simply call AutoModelForCausalLM.from_pretrained("model-name"), and the correct model class is automatically selected and initialized with the appropriate weights and configuration.

Apart from the model name, we also set two arguments of the from_pretrained() method:

  • dtype: This argument controls the precision (data type) of the model weights when loading it into memory. By default, models are usually loaded in 32-bit floating point (torch.float32), which is precise but also memory-heavy and slower to run. By specifying a different dtype, such as torch.float16 or torch.bfloat16, you can reduce memory usage and speed up inference, especially on GPUs that support mixed precision. Below we use torch.float32 since this ensures compatibility no matter whether the model runs on the CPU or a GPU; and the model is very small anyway.

  • device_map: This argument specifies how the model layers are distributed across available devices (CPUs, GPUs, or multiple GPUs). By default, a model is loaded entirely on a single device, but large models often exceed the memory capacity of one GPU. Using device_map, you can assign different parts of the model to different devices or let Hugging Face automatically decide the optimal placement with device_map="auto". This is particularly useful for efficient inference and training of large models. For example, on a multi-GPU system, device_map can split layers across GPUs to fit the model in memory while maximizing performance.

In [8]:
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float32,  # Use float16 only if you have a supported GPU with enough memory
    device_map="auto"
)

Loading Tokenizer¶

Pretrained models come with their own tokenizer because the tokenization process is tightly coupled with how the model was trained. A tokenizer determines how raw text is split into tokens (subwords, words, or characters), and these tokens are then mapped to integer IDs that the model processes. If a different tokenizer were used at inference time, the mapping between text and token IDs would not match what the model learned during training, leading to degraded performance or even nonsensical outputs. For example, the word "unbelievable" might be split into ["un", "believ", "able"] in one tokenizer and ["unbel", "ievable"] in another, which would produce different embeddings that the model was not trained to understand.

Another reason is that different pretrained models optimize tokenization for specific trade-offs and languages. Models like BERT use WordPiece, GPT-2 and GPT-3 rely on Byte-Pair Encoding (BPE), while LLaMA models adopt SentencePiece with ByteFallback. These choices affect vocabulary size, sequence length, and efficiency. By providing a tokenizer along with the pretrained model, developers ensure compatibility and reproducibility: anyone using the model can preprocess text in exactly the same way as during training, guaranteeing consistent results. In short, a pretrained model without its tokenizer is incomplete. The tokenizer ensures that raw text is transformed into the exact token sequences the model was trained to interpret, making it an essential part of the model package.
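To make this concrete, the small sketch below uses the AutoTokenizer class (imported at the top of the notebook) to load two unrelated tokenizers from the Hub — gpt2 and bert-base-uncased, neither of which is used elsewhere in this notebook — and shows how each splits the same word; the exact splits depend on each model's vocabulary:

# Compare how two different pretrained tokenizers split the same word
for name in ("gpt2", "bert-base-uncased"):
    tok = AutoTokenizer.from_pretrained(name)
    print(f"{name}: {tok.tokenize('unbelievable')}")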

The transformers library provides the AutoTokenizer.from_pretrained() method to automatically load the correct tokenizer associated with a given pretrained model. When you call it with a model name, it downloads the model's tokenizer configuration, vocabulary, and special tokens from the Hugging Face Hub (or a local directory), and returns an instance of the appropriate tokenizer class. So let's do this for our chosen model.

In [9]:
# Load the tokenizer associated with model
tokenizer = AutoTokenizer.from_pretrained(model_name)

For a quick example, we can use the tokenizer to convert a simple sentence into its corresponding sequence of token ids; see the code below. Notice that the sentence contains the word "myoglobin", which is arguably not a very common word — we will see why this makes the example a bit more interesting in a moment.

The argument return_tensors="pt" tells the tokenizer to return its outputs (like input_ids and attention_mask) as PyTorch tensors instead of plain Python lists or NumPy arrays. This is important because models in PyTorch expect tensor inputs, not lists. Similarly, you can use return_tensors="tf" for TensorFlow tensors or "jax"/"np" for JAX and NumPy arrays. But in this notebook, we work with PyTorch. We also move the output of the tokenizer to the same device (CPU or GPU) as the model using the to() method.

Note: In the example below and throughout the notebook, we consider only single prompts. While the tokenizer allows converting/encoding multiple strings at the same time, this would require additional considerations to ensure that the resulting arrays of token ids have the same length, e.g., through automatic padding. We skip this issue in the rest of the notebook to keep things simple.
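For reference only, here is a minimal sketch of batch encoding with padding (not used elsewhere in this notebook); note that LLaMA-family tokenizers do not define a padding token by default, so a common workaround is to reuse the end-of-sequence token:

# Encode several prompts at once; shorter sequences are padded to the length of the longest one
batch_sentences = ["myoglobin is a protein", "the pyramids of Giza are in Egypt"]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token

batch_inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt").to(model.device)
print(batch_inputs.input_ids.shape)  # (batch size, length of the longest sequence)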

In [10]:
example_sentence = "myoglobin is a protein"

# Tokenize the prompt and move it to the same device as the model (CPU or GPU)
inputs = tokenizer([example_sentence], return_tensors="pt").to(model.device)

# Print the resulting token ids
print(inputs.input_ids[0])
tensor([    1,   590,   468,   417,  2109,   338,   263, 26823],
       device='cuda:0')

If you look at the output, you will notice that the number of token ids is larger than the number of words in the example sentence. To see what is going on, we can convert the token ids back to the actual tokens using the convert_ids_to_tokens() method provided by the tokenizer class:

In [11]:
tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
Out[11]:
['<s>', '▁my', 'og', 'lo', 'bin', '▁is', '▁a', '▁protein']

There are two main reasons for the number of tokens to be larger than the number of words in the input sequence. Most importantly, the tokenizer is performing subword tokenization — that is, splitting less common words into more common tokens. Basically all modern models that take text as input use subword tokenization methods. The second reason for the larger number of token ids is the addition of special tokens. For example, in the case of our TinyLlama model, the special token <s> is used as a start-of-sequence marker that signals the model where the input begins. It helps the model establish context, especially when distinguishing between prompt text and generated text, and ensures consistency during training and inference. By always starting with <s>, the model can better align with the structure it was trained on, improving generation quality and reducing ambiguity in how it interprets the beginning of prompts.

Of course, in practice, particularly when receiving the output of the model in the form of arrays of token ids, we are generally not interested in the individual tokens but in the human-readable text. The tokenizer class therefore also comes with the method decode() to convert an array of token ids into text. Let's apply this method to the token ids we just generated from the example sentence.

In [12]:
tokenizer.decode(inputs.input_ids[0])
Out[12]:
'<s> myoglobin is a protein'

Now all individual tokens have been appropriately merged into the proper words. However, we still have the special token <s>, and typically we want to ignore special tokens that might be contained in the output. As such, we can set the argument skip_special_tokens=True to ignore any special tokens during the decoding step.

In [13]:
tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
Out[13]:
'myoglobin is a protein'

With the model and tokenizer loaded and ready to use, we can now submit prompts to the LLM.

Use Model for Inferencing¶

Pretrained language models often expect a specific input structure because they are typically fine-tuned or trained with a consistent prompt format. The model learns not only the language patterns but also how to interpret roles (e.g., user vs. assistant) and instructions based on the structure. If the input deviates from this expected format, the model may misinterpret the prompt, produce incomplete responses, or fail to follow instructions correctly. This is especially true for chat or instruction-following models, which rely on clearly defined message roles and separators to maintain context and provide coherent answers. For the TinyLlama-1.1B-Chat-v1.0 model, the prompt template follows a simple structured format to indicate turns in a conversation:

<|user|>
Hello, how are you?</s> 
<|assistant|>
I am doing well, thank you! How can I help you today?</s> 
<|user|>
How old are the pyramids of Giza?</s> 

In principle, we can "manually" write any prompt in this format and give it directly to the tokenizer. However, much more conveniently, we can let the tokenizer handle this automatically. The apply_chat_template() method of a tokenizer class is used to format input messages according to a model's expected chat or instruction-following template before tokenization. Once the template is applied, the tokenizer converts this structured text into token ids that the model can process. This ensures that every input, whether from a single-turn query or a multi-turn conversation, preserves the conversational structure. Thus, we can use the well-established format of writing conversational prompts supporting different roles, as shown by the messages variable in the code cell below.

When calling the apply_chat_template() method, notice the return_tensors="pt" argument. It specifies that the output of the tokenizer should be returned as PyTorch tensors. By default, tokenizers may return token IDs as Python lists or NumPy arrays, which are not directly compatible with PyTorch models. Setting return_tensors="pt" ensures that the tokenized input is converted into a torch.Tensor, which can be immediately fed into a PyTorch-based model for inference or training. This is important because LLMs in Hugging Face, like those loaded with AutoModelForCausalLM, expect inputs as tensors on a specific device (CPU or GPU). Using return_tensors="pt" simplifies the workflow by producing the correctly typed input, avoiding additional conversion steps, and allowing you to leverage PyTorch’s GPU acceleration and autograd capabilities seamlessly.

Lastly, we have to ensure that the resulting tensor resides on the same device as the model (CPU or GPU). We can do so by calling the to() method on that tensor and specifying as the target the device where the model is located.

In [14]:
# Define prompt
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I am doing well, thank you! How can I help you today?"},
    {"role": "user", "content": "How old are the pyramids of Giza?"}
]

# Apply prompt template and convert to token ids
prompt_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

Before we actually pass the tensor containing all token ids of our prompt to the model, let's first have a look at the prompt in human-readable form. We already saw how we can use the decode() method of the tokenizer for that.

In [15]:
print(tokenizer.decode(prompt_ids[0]))
<|user|>
Hello, how are you?</s> 
<|assistant|>
I am doing well, thank you! How can I help you today?</s> 
<|user|>
How old are the pyramids of Giza?</s> 

We can see how the prompt adheres to the expected template for our model. This also means that we can easily switch to a different model and a different tokenizer without changing our messages variable. The apply_chat_template() method will always ensure that the final prompt has the correct format.

Let's finally submit our prompt to the loaded model to get its response. The generate() method of the AutoModelForCausalLM class is used to produce text sequences from a causal language model. Calling generate() instructs the model to predict the next tokens in sequence, continuing until a stopping criterion is met (like a maximum length, end-of-sequence token, or custom stopping condition). This method handles the autoregressive generation internally, efficiently producing outputs without requiring you to manually implement the token-by-token loop. The method also supports a variety of generation strategies and parameters, such as greedy decoding, beam search, top-k/top-p sampling, temperature scaling, and repetition penalties. These options allow fine control over the creativity, diversity, and coherence of the generated text.

In [16]:
outputs = model.generate(
    prompt_ids,
    max_new_tokens=200,                     # Limit the number of tokens in the response
    do_sample=True,                         # Enable sampling for diversity
    temperature=0.7,                        # Sampling temperature; lower = more deterministic
)
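For reference, switching to a different decoding strategy only requires changing the arguments of the same generate() call; the parameter values below are illustrative choices rather than tuned recommendations:

# Greedy decoding: deterministic, always picks the most likely next token
greedy_outputs = model.generate(prompt_ids, max_new_tokens=200, do_sample=False)

# Nucleus (top-p) sampling with a mild repetition penalty: more diverse, less repetitive output
sampled_outputs = model.generate(
    prompt_ids,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.9,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
)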

The output of the generate() method is a list of generated token ids. We therefore need the tokenizer to decode those ids into human-readable text, as we have seen before. The output contains both the input prompt and the generated response. Typically, we are only interested in the latter. For our TinyLlama-1.1B-Chat-v1.0 model, we know that the actual response text is preceded by the <|assistant|> marker indicating the role of the model. We can therefore extract everything after the role marker and print the model's response. All three steps are performed in the code cell below.

In [17]:
# Decode the generated token IDs back into text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant's reply (remove the prompt)
response_text = response.split("<|assistant|>")[-1].strip()

# Print the final response
print(f"{response_text}")
The pyramids of Giza date back to around 2500 BC, which is around 5,000 years ago. The oldest pyramid, the Great Pyramid of Khufu, was completed around 2580 BC and was estimated to be between 140,000 and 160,000 years old when it was built. The pyramids have been the subject of debate and research for centuries, and there is no definitive answer as to their age. The exact age of the pyramids is still the subject of scholarly debate, and archeological evidence is limited to the archaeological site itself.

Similar to Ollama, the Hugging Face transformers library provides an easy and consistent way to run pretrained large language models (LLMs) locally. By offering thousands of models on the Hugging Face Hub, it allows developers to quickly download and use state-of-the-art models for a wide variety of tasks such as text generation, summarization, translation, and question answering. Users don't need to worry about the low-level details of model weights, tokenization, or architecture; the library abstracts these complexities and provides a standardized API that works across different model types and sizes.

Programmatically, the transformers library makes it simple to interact with models using Python code. With classes like AutoModelForCausalLM and AutoTokenizer, you can load a model, tokenize input text, and generate responses with just a few lines of code. Methods like generate() handle the autoregressive text generation automatically, while tokenizers manage the conversion between text and model-readable token IDs. Additional arguments, such as device_map or dtype, make it straightforward to optimize the model for your hardware and memory constraints. This combination of a rich model library, easy-to-use API, and flexible runtime options allows you to build applications, prototypes, or research experiments that leverage LLMs locally without relying on cloud services.

LangChain¶

LangChain is an open-source framework designed to build applications that use language models as components in larger systems. Unlike Hugging Face Transformers or Ollama, which primarily focus on providing access to pretrained LLMs, LangChain focuses on orchestrating language models with external tools, data sources, and memory. It provides abstractions for chains (sequences of operations), agents (LLMs that can take actions), and retrievers (accessing external knowledge), making it ideal for creating chatbots, question-answering systems, or multi-step reasoning pipelines.

In fact, the LangChain auxiliary libraries provide classes for the integration of Ollama and Hugging Face Transformers. The langchain_ollama library provides the OllamaLLM class to access models that are served by Ollama, very similar to the ollama library — but here integrated into the LangChain framework. To submit a single prompt to the model, we can use the invoke() method. This is the primary way to execute a chain or a runnable object in LangChain. It is a synchronous method that takes a single input and returns a single output, making it suitable for standard, one-off operations. The code cell below shows a minimal example.

In [18]:
# Initialize Ollama with the chosen model
ollama_llm = OllamaLLM(model="tinyllama:latest")

# Invoke the model with a query
response = ollama_llm.invoke("How tall are the pyramids of Giza?")

# Print LLM-generated response
print(response)
The pyramids of Giza, also known as Khamzeh-ye Ghaem or Sazman-e Pouya, are not officially named Pyramid I or II. They are actually pyramids that are located in Iran and are not directly associated with the ancient Egyptian civilization. The exact heights of these structures have been estimated at varying sizes, but they are generally thought to be around 15 meters tall. The exact dimensions of the pyramids have changed over time due to modifications made by later civilizations who wanted to enhance their status and reputation as well as add or subtract certain features.

The counterpart for Hugging Face Transformers is the langchain_huggingface library. However, this library still requires loading the model and the tokenizer using the respective classes from the transformers library. In fact, we can reuse the model and tokenizer we loaded earlier in the notebook; if needed, the (commented-out) code cell below includes all the required commands.

In [19]:
# Load a pretrained model and tokenizer from Hugging Face
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForCausalLM.from_pretrained(model_name)

LangChain works best with Hugging Face pipelines. In the transformers library, a pipeline is a high-level abstraction that makes it easy to use pretrained models for common tasks without worrying about the details of tokenization, model architecture, or output formatting. With just a few lines of code, you can load a model and run inference for tasks like text classification, sentiment analysis, translation, summarization, question answering, and more. Pipelines are especially useful for quick prototyping and experimentation since they hide much of the complexity of model handling. Instead of manually loading tokenizers, encoding inputs, passing them to the model, and decoding outputs, the pipeline takes care of all of this under the hood, allowing you to focus on the task at hand. This makes it a convenient entry point for beginners and a time-saver for experienced users who want fast results.

The code below creates a pipeline specialized for text generation. In this example, we give the pipeline both the model and the tokenizer object and set max_length, the maximum total length of the prompt plus the generated text, to $100$ tokens. The HuggingFacePipeline class in LangChain is a wrapper that lets you use Hugging Face's transformers pipelines directly inside the LangChain framework. Instead of calling a Hugging Face pipeline separately, you can wrap it with HuggingFacePipeline and then use it like any other LangChain LLM or chain component. This makes it easy to integrate Hugging Face models into LangChain workflows such as prompt chaining, agents, or retrieval-augmented generation. It's particularly useful when you already have a custom Hugging Face model or pipeline configured (for example, a text-generation or text-classification pipeline) and you want to leverage LangChain's tooling around it.

In [20]:
# Create a Hugging Face pipeline for text generation
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100
)

# Wrap the pipeline in a LangChain LLM object
huggingface_llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
Device set to use cuda:0
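For comparison, the underlying Hugging Face pipeline can also be called directly, without going through LangChain; a minimal sketch:

# Call the text-generation pipeline directly; it returns a list with one dict per input
result = text_gen_pipeline("How tall are the pyramids of Giza?")
print(result[0]["generated_text"])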

With the pipeline set up, we again can use the invoke() method to submit prompts to the model:

In [21]:
# Invoke the model with a query
response = huggingface_llm.invoke("How tall are the pyramids of Giza?")

# Print LLM-generated response
print(response)
How tall are the pyramids of Giza?

As mentioned before, although it can be used for it, LangChain is not just a tool for running pretrained models like large language models (LLMs) for inference; it is a full framework designed to integrate LLMs into larger, structured applications. Instead of treating a model as a standalone black box, LangChain provides abstractions for chaining prompts, incorporating external tools (such as databases, APIs, or search engines), managing context, and handling workflows. This makes it possible to build applications where LLMs interact with other systems in a coordinated way.

The advantages of using LangChain include modularity, extensibility, and orchestration. Developers can combine pretrained models with memory components, retrieval systems, or tool-using agents, all while benefiting from a consistent API. This allows them to build more reliable and context-aware applications than just raw inference would allow. LangChain also makes it easier to experiment with different LLMs, swap models, or connect them with specialized pipelines without rewriting the core logic. Example applications include chatbots and virtual assistants that can retrieve information from a knowledge base, document summarizers that work across large collections of text, question-answering systems powered by retrieval-augmented generation (RAG), and autonomous agents that can plan multi-step tasks using external tools. By embedding pretrained LLMs into such workflows, LangChain enables developers to move from simple text generation to building sophisticated AI-powered applications. However, such applications are beyond the scope of this notebook.
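Still, to give a small taste of what such orchestration looks like in code, the sketch below chains a prompt template with the Ollama-backed LLM created earlier using LangChain's runnable (pipe) syntax; the template wording is just an illustrative example:

from langchain_core.prompts import PromptTemplate

# Build a simple chain: prompt template -> local LLM
prompt_template = PromptTemplate.from_template(
    "Answer the following question in one short paragraph:\n\n{question}"
)
chain = prompt_template | ollama_llm

# Run the chain with a concrete question
print(chain.invoke({"question": "How tall are the pyramids of Giza?"}))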

Other Popular Alternatives¶

In this section, we focused on three tools and frameworks to load, run, and use pretrained LLMs on your local machine. However, there are many other alternatives that may differ with respect to their usability, flexibility, hardware requirements, and so on. Let's briefly look at some other popular alternatives for running pretrained models locally.

LM Studio¶

LM Studio is a desktop platform for local AI whose goal is to let users run, experiment with, and interact with open-source large language models (LLMs) entirely on their own machines (Windows, macOS, Linux). It provides a GUI (graphical user interface) along with built-in tools for downloading models (e.g., from Hugging Face and other sources), managing them, chatting with them, and performing document-based retrieval tasks (i.e., RAG: retrieving content from local documents to assist in chat). LM Studio also exposes APIs and SDKs, so you can not only use it via its desktop UI but also integrate model serving into your own scripts, workflows, or applications; for example, you can run a model locally and expose it via an OpenAI-compatible REST endpoint. Its purpose is to give both casual users and developers a way to experiment with LLMs locally, maintain data privacy (because inference is local), avoid cloud costs, and build prototypes or tools that use LLM capabilities without depending on cloud APIs.

LM Studio stands out for its ease of use, providing a simple graphical interface that removes the technical hurdles of running LLMs through command-line tools, dependency management, or runtime setup. Users can easily discover, download, and configure models, making it accessible even to those without deep technical expertise. Beyond usability, it offers strong flexibility and integration, functioning not just as a playground but as a local model server with API support, SDKs, and document-based retrieval (RAG) features. Combined with its support for a wide variety of open-source models in different sizes and formats, users can experiment, prototype, and deploy solutions tailored to their hardware and use cases, whether for personal assistants, apps, or research tools.
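
Because LM Studio can serve models through an OpenAI-compatible endpoint, you can talk to it with the official OpenAI Python client. The sketch below is only illustrative: the base URL assumes LM Studio's default local server address, and the model name is a placeholder for whichever model you have loaded, so adjust both to your own setup.

In [ ]:
# Illustrative sketch: query a model served by LM Studio via its OpenAI-compatible endpoint
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:1234/v1",  # assumed default address of LM Studio's local server
    api_key="lm-studio"                   # any non-empty string; the key is not checked locally
)

response = client.chat.completions.create(
    model="local-model",  # placeholder: use the identifier of the model loaded in LM Studio
    messages=[{"role": "user", "content": "How tall are the pyramids of Giza?"}],
    max_tokens=100
)

# Print the generated reply
print(response.choices[0].message.content)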

Text Generation Web UI¶

The Text Generation Web UI (often called the oobabooga Text Generation Web UI) is an open-source, browser-based interface for running and interacting with large language models locally. It provides a simple web dashboard where users can load models (such as LLaMA, GPT-J, MPT, Falcon, etc.), configure generation parameters, and chat with them. The goal is to make experimenting with different open-source LLMs easier without needing to write Python code or rely heavily on command-line tools. It also supports features like role-play, multi-user chat, model fine-tuning (LoRA adapters), and extensions for retrieval or other integrations. The main purpose of the Text Generation Web UI is to offer a user-friendly, customizable, and extensible environment for people who want to run and test LLMs locally on their own hardware. By exposing advanced parameters (like temperature, top-p, and repetition penalty) and allowing plugins, it enables fine-grained control over how the model behaves. It is widely used by hobbyists, researchers, and developers who want more flexibility than a pre-packaged desktop app but still want a convenient interface.

Compared to LM Studio, the pros of the Text Generation Web UI include its open-source nature, high degree of customization, and an active community that contributes plugins, model support, and updates. Because it is browser-based, you can also access it from multiple devices on the same network. It supports a wide range of models and inference backends, making it one of the most versatile local LLM frontends available. On the other hand, its setup can be more technical and complex than LM Studio's, since you need to install Python, dependencies, and sometimes GPU runtimes. The interface, while powerful, may feel overwhelming for casual users, and performance still depends heavily on your hardware. Unlike LM Studio's polished, ready-to-use design, the Text Generation Web UI leans more toward tinkerers who are comfortable adjusting settings and experimenting.

Text Generation Inference (TGI)¶

Text Generation Inference (TGI) is Hugging Face's open-source, production-ready server for running and serving large language models efficiently. Unlike graphical frontends such as LM Studio or the Text Generation Web UI, TGI focuses on high-performance inference in enterprise and developer environments. It provides an optimized backend for transformer-based models with support for quantization, batching, tensor parallelism, and streaming, all designed to maximize throughput and minimize latency when deploying LLMs at scale. Hugging Face uses TGI under the hood for its own Inference Endpoints and the Hugging Face Hub.

The main purpose of TGI is to make serving large language models reliable and efficient in production contexts. It exposes a simple REST API for text generation (including an OpenAI-compatible chat endpoint), so developers can swap it into existing workflows easily. Beyond raw generation, it also supports features like token streaming, log probabilities, and stopping criteria, which are important for building advanced AI applications. It is particularly well-suited for organizations that want to self-host powerful LLMs and benefit from these optimizations without writing custom serving code. While LM Studio is a desktop-focused tool designed for personal, local experimentation and integration, TGI is an enterprise-grade inference solution built with scalable deployments and efficiency in mind.
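
To give a sense of how this looks from the client side, the sketch below sends a request to a TGI server's generate endpoint. It assumes you have started TGI locally (for example, via its Docker image) and mapped it to port 8080; adjust the URL and generation parameters to match your own deployment.

In [ ]:
# Illustrative sketch: call a locally running TGI server over HTTP
import requests

payload = {
    "inputs": "How tall are the pyramids of Giza?",
    "parameters": {"max_new_tokens": 100}
}

# Send the request to the (assumed) local TGI endpoint and print the generated text
response = requests.post("http://localhost:8080/generate", json=payload)
print(response.json()["generated_text"])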


Summary¶

Running pretrained large language models (LLMs) locally has become increasingly accessible thanks to tools and frameworks such as Ollama, Hugging Face Transformers, and LangChain. These platforms allow developers to load models onto their own machines and interact with them programmatically using Python or other supported languages. Unlike cloud-based APIs, local LLM deployment provides full control over the model, data privacy, and lower latency, which is especially valuable for sensitive data or offline use cases. With a few lines of code, developers can instantiate models, tokenize inputs, and generate outputs, making local deployment suitable for experimentation, prototyping, or building custom applications.

The Hugging Face Transformers library is widely used for programmatic access to LLMs. Its Python API provides a unified interface to load pretrained models and tokenizers, run inference, and even fine-tune models. Using the pipeline abstraction or the AutoModel and AutoTokenizer classes, developers can quickly set up models for tasks such as text generation, classification, or question answering. The library also integrates well with PyTorch and TensorFlow, enabling both CPU and GPU execution. Its main advantage is ease of integration with existing Python workflows, along with a large catalog of models that can be accessed programmatically.

Ollama provides a different approach by combining a local model repository with a simple Python client and command-line interface. It allows developers to download pretrained models and interact with them in a chat-oriented fashion or programmatically. The Python client supports sending prompts, streaming responses, and chaining interactions, making it suitable for building chatbots, agents, or other conversational tools locally. One key benefit of Ollama is simplicity in model management, as the framework handles downloading, caching, and running models efficiently.

LangChain, on the other hand, is a higher-level framework that allows developers to embed LLMs into structured workflows and applications. Instead of just calling a model for inference, LangChain enables chaining prompts, adding memory, integrating retrieval systems, or connecting models with external tools. It supports using local or cloud-based models, including those from Hugging Face or Ollama, and exposes a consistent Python API for orchestrating multi-step interactions. This allows developers to build complex applications such as autonomous agents, document assistants, or retrieval-augmented generation systems programmatically.

The pros of running LLMs locally using these frameworks include data privacy, low latency, cost control, and full control over the model. Developers can experiment freely without cloud restrictions or API limits and integrate models into custom Python applications. The cons include hardware requirements, since larger models demand high RAM and GPUs, and complexity in setup for some frameworks, particularly when optimizing for performance. Additionally, keeping models up to date or integrating the latest model versions can require manual management. Overall, these frameworks provide a powerful, programmatic way to harness LLMs locally, balancing flexibility with control over resources and workflows.

In [ ]: