diff --git a/README.md b/README.md
index 5b63999..3b99e8d 100644
--- a/README.md
+++ b/README.md
@@ -264,10 +264,14 @@
 cd jigsaw_data
 
 # download data
 kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
+unzip jigsaw-toxic-comment-classification-challenge.zip -d jigsaw-toxic-comment-classification-challenge
+find jigsaw-toxic-comment-classification-challenge -name '*.csv.zip' | xargs -n1 unzip -d jigsaw-toxic-comment-classification-challenge
 kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification
+unzip jigsaw-unintended-bias-in-toxicity-classification.zip -d jigsaw-unintended-bias-in-toxicity-classification
 kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification
+unzip jigsaw-multilingual-toxic-comment-classification.zip -d jigsaw-multilingual-toxic-comment-classification
 ```
 
 ## Start Training
 
diff --git a/preprocessing_utils.py b/preprocessing_utils.py
index 3549744..437db69 100644
--- a/preprocessing_utils.py
+++ b/preprocessing_utils.py
@@ -1,18 +1,29 @@
 import argparse
+import logging
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
 
+logger = logging.getLogger("preprocessing_utils")
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    level=logging.INFO,
+)
+
 
 def update_test(test_csv_file):
     """Combines disjointed test and labels csv files into one file."""
+    test_csv_file = Path(test_csv_file)
     test_set = pd.read_csv(test_csv_file)
-    data_labels = pd.read_csv(test_csv_file[:-4] + "_labels.csv")
+    data_labels = pd.read_csv(test_csv_file.with_name(f"{test_csv_file.stem}_labels.csv"))
     for category in data_labels.columns[1:]:
         test_set[category] = data_labels[category]
     if "content" in test_set.columns:
         test_set.rename(columns={"content": "comment_text"}, inplace=True)
-    test_set.to_csv(f"{test_csv_file.split('.csv')[0]}_updated.csv")
+    output_file = test_csv_file.parent / f"{test_csv_file.stem}_updated.csv"
+    test_set.to_csv(output_file)
+    logger.info("Updated test set saved to %s", output_file)
     return test_set
 
 
@@ -20,12 +31,15 @@ def create_val_set(csv_file, val_fraction):
     """Takes in a csv file path and creates a validation set
     out of it specified by val_fraction.
""" + csv_file = Path(csv_file) dataset = pd.read_csv(csv_file) np.random.seed(0) dataset_mod = dataset[dataset.toxic != -1] indices = np.random.rand(len(dataset_mod)) > val_fraction val_set = dataset_mod[~indices] - val_set.to_csv("val.csv") + output_file = csv_file.parent / "val.csv" + logger.info("Validation set saved to %s", output_file) + val_set.to_csv(output_file) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 4887b69..eed9a18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,10 @@ classifiers = [ "Operating System :: OS Independent", "Programming Language :: Python :: 3", ] -requires-python = ">=3.9,<3.12" +requires-python = ">=3.9,<3.13" dependencies = [ "sentencepiece >= 0.1.94", - "torch < 2.2", + "torch >=2", "transformers >= 3", ] @@ -29,7 +29,7 @@ dev = [ "datasets >= 1.0.2", "pandas >= 1.1.2", "pytest", - "pytorch-lightning<2.0.0,>1.5.0", + "pytorch-lightning>2", "scikit-learn >= 0.23.2", "tqdm", "pre-commit", diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 1608de7..e12a824 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,6 +1,7 @@ import json import src.data_loaders as module_data + import torch from pytorch_lightning import seed_everything, Trainer from torch.utils.data import DataLoader @@ -37,7 +38,7 @@ def get_instance(module, name, config, *args, **kwargs): ) trainer = Trainer( - gpus=0 if torch.cuda.is_available() else None, + accelerator="gpu" if torch.cuda.is_available() else "cpu", limit_train_batches=2, limit_val_batches=2, max_epochs=1, diff --git a/train.py b/train.py index ac75615..674f96a 100644 --- a/train.py +++ b/train.py @@ -3,6 +3,7 @@ import os import pytorch_lightning as pl + import src.data_loaders as module_data import torch from pytorch_lightning.callbacks import ModelCheckpoint @@ -159,7 +160,7 @@ def cli_main(): "--device", default=None, type=str, - help="indices of GPUs to enable (default: None)", + help="comma-separated indices of GPUs to enable (default: None)", ) parser.add_argument( "--num_workers", @@ -208,16 +209,26 @@ def get_instance(module, name, config, *args, **kwargs): monitor="val_loss", mode="min", ) + + if args.device is None: + devices = "auto" + else: + devices = [int(d.strip()) for d in args.device.split(",")] + trainer = pl.Trainer( - gpus=args.device, + devices=devices, max_epochs=args.n_epochs, accumulate_grad_batches=config["accumulate_grad_batches"], callbacks=[checkpoint_callback], - resume_from_checkpoint=args.resume, default_root_dir="saved/" + config["name"], deterministic=True, ) - trainer.fit(model, data_loader, valid_data_loader) + trainer.fit( + model=model, + train_dataloaders=data_loader, + val_dataloaders=valid_data_loader, + ckpt_path=args.resume, + ) if __name__ == "__main__":