From bd02b5ba451004a63e6bf38b73571b8f1d5e9638 Mon Sep 17 00:00:00 2001
From: Sewon Min
Date: Sat, 15 Apr 2023 10:52:25 -0700
Subject: [PATCH] update test.py and warning about version

---
 .gitignore | 2 ++
 README.md  | 4 ++++
 test.py    | 9 ++++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4e9d11e..28a750c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,5 @@ my*
 *.pkl
 *.jsonl
 *.csv
+
+Makefile
diff --git a/README.md b/README.md
index 44eb762..a067efe 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,8 @@ pip install datasets==1.4.0 wget
 ```
 Warning: we found that `datasets==1.4.0` is not compatible with Transformers version we use for training and inference. Please use a separate environement for data preprocessing and model training/inference.
 
+**Updates on 04/15/2023**: We found that a few datasets are not supported anymore with `datasets==1.4.0`. Nonetheless, we found that the results reported in the paper are roughly reproducible with other versions of `datasets` as well (for instance, `2.11.0`, which is the most recent version at the moment). If you find that other versions of `datasets` do not reproduce the results in the paper, please report it to us.
+
 ```
 cd preprocess
 # preprocess from crossfit
@@ -194,6 +196,8 @@ Additional flags:
 
 If you want to use values that are different from default ones, please simply add the flag, e.g., `python _build_gym.py --build --n_proc=40 --do_test --test_k 4`.
 
+**Updates on 04/15/2023**: If some datasets are not supported anymore, the script will print out a list of failed datasets at the end, along with the command lines you can re-run to check the error messages.
+
 #### Process instruction version
 
 The instruction version is for settings using instructions. We use instructions from [BigScience PromptSource][t0-repo]. First, fetch instructions (prompts) from PromptSource by doing the following.
diff --git a/test.py b/test.py
index 7634142..ffe3b61 100644
--- a/test.py
+++ b/test.py
@@ -188,7 +188,14 @@ def run(logger, task, metaicl_data, metaicl_model, train_data, dev_data, seed,
         prediction_path = prediction_path.replace(".txt", "-calibrated.txt")
 
     if os.path.exists(prediction_path):
-        return 0
+        with open(prediction_path, "r") as f:
+            predictions = []
+            for line in f:
+                predictions.append(line.strip())
+        groundtruths = [dp["output"] for dp in dev_data]
+        perf = metaicl_data.evaluate(predictions, groundtruths, is_classification)
+        logger.info("Accuracy=%s" % perf)
+        return perf
 
     if os.path.exists(cache_path):
         with open(cache_path, "rb") as f:
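For context on the test.py hunk: the patch changes the early-exit branch so that an existing prediction file is re-scored instead of short-circuiting with `return 0`. Below is a minimal, self-contained sketch of that re-scoring path. The helpers `load_cached_predictions` and `exact_match_accuracy`, the sample data, and the file name are illustrative assumptions; the actual code calls `metaicl_data.evaluate(predictions, groundtruths, is_classification)` on MetaICL's data object, as shown in the diff above.

```python
import os


def load_cached_predictions(prediction_path):
    """Read one prediction per line, mirroring the loop added in test.py."""
    with open(prediction_path, "r") as f:
        return [line.strip() for line in f]


def exact_match_accuracy(predictions, groundtruths):
    """Illustrative stand-in for metaicl_data.evaluate(...): exact-match accuracy."""
    assert len(predictions) == len(groundtruths)
    if not groundtruths:
        return 0.0
    correct = sum(p == g for p, g in zip(predictions, groundtruths))
    return correct / len(groundtruths)


if __name__ == "__main__":
    # Hypothetical dev set; each item carries the gold answer under "output",
    # matching how the patch builds `groundtruths` from `dev_data`.
    dev_data = [{"output": "positive"}, {"output": "negative"}]

    prediction_path = "example-predictions.txt"  # hypothetical cached predictions
    with open(prediction_path, "w") as f:
        f.write("positive\nnegative\n")

    if os.path.exists(prediction_path):
        predictions = load_cached_predictions(prediction_path)
        groundtruths = [dp["output"] for dp in dev_data]
        perf = exact_match_accuracy(predictions, groundtruths)
        print("Accuracy=%s" % perf)
```

The design choice the patch reflects: a pre-existing prediction file is treated as a cache, so a re-run reports the stored performance rather than silently returning 0.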