from llama import BasicModelRunner
# Try non-finetuned models
non_finetuned = BasicModelRunner("meta-llama/Llama-2-7b-hf")
non_finetuned_output = non_finetuned("Tell me how to train my dog to sit")
print(non_finetuned_output)
print(non_finetuned("What do you think of Mars?"))
print(non_finetuned("taylor swift's best friend"))
print(non_finetuned("""Agent: I'm here to help you with your Amazon deliver order.
Customer: I didn't get my item
Agent: I'm sorry to hear that. Which item was it?
Customer: the blanket
Agent:"""))
# Compare to finetuned models
finetuned_model = BasicModelRunner("meta-llama/Llama-2-7b-chat-hf")
finetuned_output = finetuned_model("Tell me how to train my dog to sit")
print(finetuned_output)
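# Llama 2's chat variants were finetuned with an [INST] ... [/INST] instruction
# wrapper, so adding those tags helps the chat model but does little for the
# base model, as the next two prompts illustrate.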
print(finetuned_model("[INST]Tell me how to train my dog to sit[/INST]"))
print(non_finetuned("[INST]Tell me how to train my dog to sit[/INST]"))
print(finetuned_model("What do you think of Mars?"))
print(finetuned_model("taylor swift's best friend"))
print(finetuned_model("""Agent: I'm here to help you with your Amazon deliver order.
Customer: I didn't get my item
Agent: I'm sorry to hear that. Which item was it?
Customer: the blanket
Agent:"""))
# Compare to ChatGPT
chatgpt = BasicModelRunner("chat-gpt")
print(chatgpt("Tell me how to train my dog to sit"))
import jsonlines
import itertools
import pandas as pd
from pprint import pprint
import datasets
from datasets import load_dataset
# dataset https://huggingface.co/datasets/c4
#pretrained_dataset = load_dataset("EleutherAI/pile", split="train", streaming=True)
pretrained_dataset = load_dataset("c4", "en", split="train", streaming=True)
n = 5
print("Pretrained dataset:")
top_n = itertools.islice(pretrained_dataset, n)
for i in top_n:
print(i)
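# Optional sketch (not in the original walkthrough): each streamed C4 record is
# a dict with a "text" field plus metadata such as "url" and "timestamp", so you
# can peek at just the raw text like this.
for i in itertools.islice(pretrained_dataset, n):
    print(i["text"][:250], "...")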
# Contrast with the company finetuning dataset you will be using
filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
instruction_dataset_df
# Various ways of formatting your data
examples = instruction_dataset_df.to_dict()
text = examples["question"][0] + examples["answer"][0]
text
if "question" in examples and "answer" in examples:
text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
text = examples["input"][0] + examples["output"][0]
else:
text = examples["text"][0]
prompt_template_qa = """### Question:
{question}
### Answer:
{answer}"""
question = examples["question"][0]
answer = examples["answer"][0]
text_with_prompt_template = prompt_template_qa.format(question=question, answer=answer)
text_with_prompt_template
prompt_template_q = """### Question:
{question}
### Answer:"""
num_examples = len(examples["question"])
finetuning_dataset_text_only = []
finetuning_dataset_question_answer = []
for i in range(num_examples):
question = examples["question"][i]
answer = examples["answer"][i]
text_with_prompt_template_qa = prompt_template_qa.format(question=question, answer=answer)
finetuning_dataset_text_only.append({"text": text_with_prompt_template_qa})
text_with_prompt_template_q = prompt_template_q.format(question=question)
finetuning_dataset_question_answer.append({"question": text_with_prompt_template_q, "answer": answer})
pprint(finetuning_dataset_text_only[0])
pprint(finetuning_dataset_question_answer[0])
# Common ways of storing your data
with jsonlines.open('lamini_docs_processed.jsonl', 'w') as writer:
writer.write_all(finetuning_dataset_question_answer)
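# Optional sanity check (an addition, assuming the file above was just written):
# read the JSONL back to confirm the records round-trip as expected.
with jsonlines.open('lamini_docs_processed.jsonl') as reader:
    for record in itertools.islice(reader, 2):
        pprint(record)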
finetuning_dataset_name = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_name)
print(finetuning_dataset)
import itertools
import jsonlines
from datasets import load_dataset
from pprint import pprint
from llama import BasicModelRunner
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# Load an instruction-tuned dataset
instruction_tuned_dataset = load_dataset("tatsu-lab/alpaca", split="train", streaming=True)
m = 5
print("Instruction-tuned dataset:")
top_m = list(itertools.islice(instruction_tuned_dataset, m))
for j in top_m:
print(j)
# Two prompt templates
prompt_template_with_input = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
prompt_template_without_input = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
# Hydrate prompts (add data to prompts)
processed_data = []
for j in top_m:
if not j["input"]:
processed_prompt = prompt_template_without_input.format(instruction=j["instruction"])
else:
processed_prompt = prompt_template_with_input.format(instruction=j["instruction"], input=j["input"])
processed_data.append({"input": processed_prompt, "output": j["output"]})
pprint(processed_data[0])
# Save data to jsonl
with jsonlines.open('alpaca_processed.jsonl', 'w') as writer:
writer.write_all(processed_data)
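# Optional (not part of the original flow): the processed JSONL can be loaded
# straight back from disk as a Hugging Face dataset, without pushing to the Hub.
local_alpaca = load_dataset("json", data_files="alpaca_processed.jsonl", split="train")
print(local_alpaca)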
# Compare non-instruction-tuned vs. instruction-tuned models
dataset_path_hf = "lamini/alpaca"
dataset_hf = load_dataset(dataset_path_hf)
print(dataset_hf)
non_instruct_model = BasicModelRunner("meta-llama/Llama-2-7b-hf")
non_instruct_output = non_instruct_model("Tell me how to train my dog to sit")
print("Not instruction-tuned output (Llama 2 Base):", non_instruct_output)
instruct_model = BasicModelRunner("meta-llama/Llama-2-7b-chat-hf")
instruct_output = instruct_model("Tell me how to train my dog to sit")
print("Instruction-tuned output (Llama 2): ", instruct_output)
chatgpt = BasicModelRunner("chat-gpt")
instruct_output_chatgpt = chatgpt("Tell me how to train my dog to sit")
print("Instruction-tuned output (ChatGPT): ", instruct_output_chatgpt)
# Try smaller models
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    # Tokenize the prompt, truncating it to at most max_input_tokens tokens
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )
    # Generate up to max_output_tokens new tokens beyond the prompt
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_new_tokens=max_output_tokens
    )
    # Decode the full sequence (prompt + completion)
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
    # Strip the prompt to keep only the generated answer
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    return generated_text_answer
test_sample = finetuning_dataset["test"][0]
print(test_sample)
print(inference(test_sample["question"], model, tokenizer))
# Compare to finetuned small model
instruction_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
print(inference(test_sample["question"], instruction_model, tokenizer))
# Pssst! If you were curious how to upload your own dataset to Hugging Face,
# here is how we did it:
# !pip install huggingface_hub
# !huggingface-cli login
# import pandas as pd
# import datasets
# from datasets import Dataset
# finetuning_dataset = Dataset.from_pandas(pd.DataFrame(data=finetuning_dataset))
# finetuning_dataset.push_to_hub(dataset_path_hf)
# Technically, it's only a few lines of code to run on GPUs (elsewhere, i.e. on Lamini).
from llama import BasicModelRunner
model = BasicModelRunner("EleutherAI/pythia-410m")
model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
model.train(is_public=True)
# 1. Choose a base model.
# 2. Load data.
# 3. Train it. Returns a model ID, dashboard, and playground interface.
# Let's look under the hood at the core code running this! This is the open core of Lamini's `llama` library :)
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from utilities import *  # provides tokenize_and_split_data and the Trainer used below
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from llama import BasicModelRunner
logger = logging.getLogger(__name__)
global_config = None
# Load the Lamini docs dataset: either from a local JSONL file...
dataset_name = "lamini_docs.jsonl"
dataset_path = f"/content/{dataset_name}"
use_hf = False
# ...or from the Hugging Face Hub (this overrides the local path and is what we use below)
dataset_path = "lamini/lamini_docs"
use_hf = True
# Set up the model, training config, and tokenizer
model_name = "EleutherAI/pythia-70m"
training_config = {
"model": {
"pretrained_name": model_name,
"max_length" : 2048
},
"datasets": {
"use_hf": use_hf,
"path": dataset_path
},
"verbose": True
}
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
print(train_dataset)
print(test_dataset)
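# tokenize_and_split_data comes from the course's utilities module. The sketch
# below is a hypothetical illustration of the kind of preprocessing such a
# helper performs (concatenate question + answer, tokenize to max_length,
# set labels for causal LM training, split into train/test); it is not the
# actual implementation, and all names in it are made up for illustration.
def sketch_tokenize_and_split(df, tokenizer, max_length=2048, test_size=0.1):
    ds = datasets.Dataset.from_pandas(df)  # expects "question" and "answer" columns
    def to_features(batch):
        texts = [q + a for q, a in zip(batch["question"], batch["answer"])]
        tokens = tokenizer(texts, truncation=True, max_length=max_length, padding="max_length")
        tokens["labels"] = [list(ids) for ids in tokens["input_ids"]]  # causal LM: labels mirror input_ids
        return tokens
    ds = ds.map(to_features, batched=True, remove_columns=ds.column_names)
    split = ds.train_test_split(test_size=test_size, shuffle=True, seed=42)
    return split["train"], split["test"]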
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device_count = torch.cuda.device_count()
if device_count > 0:
logger.debug("Select GPU device")
device = torch.device("cuda")
else:
logger.debug("Select CPU device")
device = torch.device("cpu")
base_model.to(device)
# Define function to carry out inference
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    # Tokenize the prompt, truncating it to at most max_input_tokens tokens
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )
    # Generate up to max_output_tokens new tokens beyond the prompt
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_new_tokens=max_output_tokens
    )
    # Decode the full sequence (prompt + completion)
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
    # Strip the prompt to keep only the generated answer
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    return generated_text_answer
# Try the base model
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))
# Set up training
max_steps = 3
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name
training_args = TrainingArguments(
# Learning rate
learning_rate=1.0e-5,
# Number of training epochs
num_train_epochs=1,
# Max steps to train for (each step is a batch of data)
# Overrides num_train_epochs, if not -1
max_steps=max_steps,
# Batch size for training
per_device_train_batch_size=1,
# Directory to save model checkpoints
output_dir=output_dir,
# Other arguments
overwrite_output_dir=False, # Overwrite the content of the output directory
disable_tqdm=False, # Disable progress bars
eval_steps=120, # Number of update steps between two evaluations
save_steps=120, # After # steps model is saved
warmup_steps=1, # Number of warmup steps for learning rate scheduler
per_device_eval_batch_size=1, # Batch size for evaluation
evaluation_strategy="steps",
logging_strategy="steps",
logging_steps=1,
optim="adafactor",
gradient_accumulation_steps = 4,
gradient_checkpointing=False,
# Parameters for early stopping
load_best_model_at_end=True,
save_total_limit=1,
metric_for_best_model="eval_loss",
greater_is_better=False
)
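# Note: the effective batch size per optimizer step is per_device_train_batch_size
# x gradient_accumulation_steps (x number of devices), i.e. 1 x 4 = 4 in this setup.
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
    * max(1, device_count)
)
print("Effective batch size:", effective_batch_size)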
model_flops = (
base_model.floating_point_ops(
{
"input_ids": torch.zeros(
(1, training_config["model"]["max_length"])
)
}
)
* training_args.gradient_accumulation_steps
)
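# floating_point_ops estimates the forward + backward cost of one batch as
# roughly 6 x (number of input tokens) x (number of model parameters);
# multiplying by gradient_accumulation_steps gives FLOPs per optimizer step.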
print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")
trainer = Trainer(
model=base_model,
model_flops=model_flops,
total_steps=max_steps,
args=training_args,
train_dataset=train_dataset,
eval_dataset=test_dataset,
)
# Train a few steps
training_output = trainer.train()
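# Quick sanity check (added here; assumes the Trainer returns the standard
# transformers TrainOutput): inspect the step count and training loss.
print(training_output.metrics)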
# Save model locally
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)
print("Saved model to:", save_dir)
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)
# Run slightly trained model
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)
print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)
# Run same model trained for two epochs
finetuned_longer_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
tokenizer = AutoTokenizer.from_pretrained("lamini/lamini_docs_finetuned")
finetuned_longer_model.to(device)
print("Finetuned longer model's answer: ")
print(inference(test_question, finetuned_longer_model, tokenizer))
# Run much larger trained model and explore moderation
bigger_finetuned_model = BasicModelRunner(model_name_to_id["bigger_model_name"])  # model_name_to_id is defined outside this snippet
bigger_finetuned_output = bigger_finetuned_model(test_question)
print("Bigger (2.8B) finetuned model (test): ", bigger_finetuned_output)
count = 0
for i in range(len(train_dataset)):
if "keep the discussion relevant to Lamini" in train_dataset[i]["answer"]:
print(i, train_dataset[i]["question"], train_dataset[i]["answer"])
count += 1
print(count)
# Explore moderation using small model
base_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
print(inference("What do you think of Mars?", base_model, base_tokenizer))
# Now try moderation with finetuned small model
print(inference("What do you think of Mars?", finetuned_longer_model, tokenizer))
# Finetune a model in 3 lines of code using Lamini
model = BasicModelRunner("EleutherAI/pythia-410m")
model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
model.train(is_public=True)
out = model.evaluate()
lofd = []
for e in out['eval_results']:
q = f"{e['input']}"
at = f"{e['outputs'][0]['output']}"
ab = f"{e['outputs'][1]['output']}"
di = {'question': q, 'trained model': at, 'Base Model' : ab}
lofd.append(di)
df = pd.DataFrame.from_dict(lofd)
style_df = df.style.set_properties(**{'text-align': 'left'})
style_df = style_df.set_properties(**{"vertical-align": "text-top"})
style_df
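# Optional addition: persist the side-by-side comparison for later review
# (the CSV filename is just an illustrative choice).
df.to_csv("lamini_docs_eval_comparison.csv", index=False)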