from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused further down when the model
# configuration is loaded.
model_nm = "ProsusAI/finbert"

# Tokenizer that belongs to the checkpoint named above.
tokz = AutoTokenizer.from_pretrained(model_nm)

# Sanity check: split a short sample sentence into tokens (the result is only
# displayed when run interactively).
tokz.tokenize("Hi! I am Sami")

from datasets import load_dataset

# Load the dataset identified by this name.
dickens_ds = load_dataset("GuillermoTBB/charles-dickens-text-classification")
dickens_ds

# Keep separate handles on the two splits used in the rest of the script.
train = dickens_ds['train']
test = dickens_ds['test']

# Pandas view of the training split, for inspection.
train_df = train.to_pandas()
train_df

def tok_func(ds):
    """Tokenize the ``'text'`` entry of ``ds`` with the module-level tokenizer.

    Args:
        ds: A mapping with a ``'text'`` entry (this function is passed to
            ``Dataset.map(..., batched=True)`` elsewhere in the file).

    Returns:
        The tokenizer output for those texts, truncated to the model's
        maximum sequence length.
    """
    # Without truncation, a passage longer than the model's position limit
    # would fail in the forward pass. Some tokenizer configs leave
    # ``model_max_length`` at a huge sentinel value, so cap it explicitly.
    # NOTE(review): 512 is the usual BERT position limit -- confirm for this
    # checkpoint.
    max_len = min(tokz.model_max_length, 512)
    return tokz(ds['text'], truncation=True, max_length=max_len)

# Run the tokenizer over the training split in batches.
tok_ds = train.map(tok_func, batched=True)
tok_ds

# Pandas view of the tokenized split, for inspection.
tok_ds_df = tok_ds.to_pandas()
tok_ds_df

import numpy as np
from numpy.random import normal, seed, uniform
np.random.seed(42)

# Hold out 25% of the tokenized training data, and rename the target column
# from 'label' to "labels" in the same step.
dataloaders = tok_ds.train_test_split(test_size=0.25, seed=42).rename_column('label', "labels")

# Expose only these three columns, formatted as torch tensors.
dataloaders.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dataloaders

from transformers import TrainingArguments

# Fine-tuning hyper-parameters.
bs = 4
epochs = 2
# The previous value of 0.1 is several orders of magnitude too large for
# fine-tuning a transformer and makes training diverge. Use a step size in
# the customary 1e-5 .. 5e-5 range instead.
learning_rate = 2e-5

args = TrainingArguments(
    output_dir="dickens-model",
    learning_rate=learning_rate,
    per_device_train_batch_size=bs,
    # Evaluation uses twice the training batch size.
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Evaluate and save once per epoch; the two strategies are set to the same
    # value, and load_best_model_at_end is enabled.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

def accuracy(prediction, label):
    """Return the fraction of rows whose highest-scoring column equals ``label``.

    Args:
        prediction: 2-D array-like of scores, one row per example; the
            predicted class is the index of the largest value in each row.
        label: Array-like of expected class indices, one per row.

    Returns:
        The mean of the element-wise match between predicted and expected
        classes.
    """
    predicted = np.asarray(prediction).argmax(axis=1)
    hits = predicted == label
    return np.mean(hits)

def metrics(dataset):
    """Compute the metric dictionary for one evaluation pass.

    Args:
        dataset: An iterable that unpacks into the positional arguments of
            ``accuracy`` (prediction scores, then labels).

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    score = accuracy(*dataset)
    return {'accuracy': score}

from transformers import AutoConfig, AutoModelForSequenceClassification, Trainer

# Start from the checkpoint's configuration but declare a two-class head with
# readable label names.
config = AutoConfig.from_pretrained(
    model_nm,
    num_labels=2,
    id2label={0: "not_dickens", 1: "dickens"},
    label2id={"not_dickens": 0, "dickens": 1},
)

# Load the pretrained weights. Building the model with ``from_config`` only
# creates the architecture with randomly initialised parameters, so nothing
# from the checkpoint would be reused. The checkpoint's own classification head
# may have a different number of outputs than the two requested above;
# ``ignore_mismatched_sizes=True`` lets that layer be freshly initialised
# instead of raising a size-mismatch error.
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm,
    config=config,
    ignore_mismatched_sizes=True,
)

# NOTE(review): recent transformers releases prefer ``processing_class`` over
# ``tokenizer`` -- confirm against the installed version before changing.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataloaders['train'],
    eval_dataset=dataloaders['test'],
    tokenizer=tokz,
    compute_metrics=metrics,
)

trainer.train()

# Tokenize the held-out test split with the same function used for training.
test = test.map(function=tok_func, batched=True)
test

# Run the trained model over the tokenized test split.
preds = trainer.predict(test_dataset=test)
preds

# Saved checkpoint directory used for both the weight inspection and the
# inference pipeline below.
checkpoint_dir = "./dickens-model/checkpoint-165"

# Reload the model from disk and show the bias of its classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
print("Head bias:", model.classifier.bias.data)

# using the pipeline API -- a general inference function
from transformers import pipeline

# NOTE(review): `return_all_scores` is reported as deprecated in recent
# transformers releases in favour of `top_k=None`, which changes the output
# nesting -- confirm before switching.
classifier = pipeline(
    task="text-classification",
    model=checkpoint_dir,
    tokenizer=checkpoint_dir,
    return_all_scores=True,
)

# Sample passage to classify.
sample_passage = "Joe was evidently made uncomfortable by what he supposed to be my loss of appetite, and took a thoughtful bite out of his slice, which he didn’t seem to enjoy. He turned it about in his mouth much longer than usual, pondering over it a good deal, and after all gulped it down like a pill. He was about to take another bite, and had just got his head on one side for a good purchase on it, when his eye fell on me, and he saw that my bread and butter was gone."
classifier(sample_passage)

# NLP - RNNs, Transformers, Hugging Face

# Language Models

# Recurrent Neural Networks - RNNs

# Transformers

# Tokenization and Numericalization

# Our Dataset

# Training our model

# Defining our metrics

# Training our model