From bd02b5ba451004a63e6bf38b73571b8f1d5e9638 Mon Sep 17 00:00:00 2001
From: Sewon Min
Date: Sat, 15 Apr 2023 10:52:25 -0700
Subject: [PATCH] update test.py and warning about version

---
 .gitignore | 2 ++
 README.md  | 4 ++++
 test.py    | 9 ++++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4e9d11e..28a750c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,5 @@ my*
 *.pkl
 *.jsonl
 *.csv
+
+Makefile
diff --git a/README.md b/README.md
index 44eb762..a067efe 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,8 @@ pip install datasets==1.4.0 wget
 ```
 Warning: we found that `datasets==1.4.0` is not compatible with Transformers version we use for training and inference. Please use a separate environement for data preprocessing and model training/inference.
 
+**Updates on 04/15/2023**: We found that a few datasets are not supported anymore with `datasets==1.4.0`. Nonetheless, we found that the results reported in the paper are roughly reproducible with other versions of `datasets` as well (for instance, `2.11.0`, which is the most recent version at the moment). If you find that other versions of `datasets` do not reproduce the results in the paper, please report it to us.
+
 ```
 cd preprocess
 # preprocess from crossfit
@@ -194,6 +196,8 @@ Additional flags:
 
 If you want to use values that are different from default ones, please simply add the flag, e.g., `python _build_gym.py --build --n_proc=40 --do_test --test_k 4`.
 
+**Updates on 04/15/2023**: If some datasets are not supported anymore, the script will print out a list of failed datasets at the end, along with the command lines you can re-run to check the error messages.
+
 #### Process instruction version
 
 The instruction version is for settings using instructions. We use instructions from [BigScience PromptSource][t0-repo]. First, fetch instructions (prompts) from PromptSource by doing the following.
diff --git a/test.py b/test.py
index 7634142..ffe3b61 100644
--- a/test.py
+++ b/test.py
@@ -188,7 +188,14 @@ def run(logger, task, metaicl_data, metaicl_model, train_data, dev_data, seed,
         prediction_path = prediction_path.replace(".txt", "-calibrated.txt")
 
     if os.path.exists(prediction_path):
-        return 0
+        with open(prediction_path, "r") as f:
+            predictions = []
+            for line in f:
+                predictions.append(line.strip())
+        groundtruths = [dp["output"] for dp in dev_data]
+        perf = metaicl_data.evaluate(predictions, groundtruths, is_classification)
+        logger.info("Accuracy=%s" % perf)
+        return perf
 
     if os.path.exists(cache_path):
         with open(cache_path, "rb") as f:
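For context on the test.py hunk: the patch changes the early-exit branch so that an existing prediction file is re-scored instead of short-circuiting with `return 0`. Below is a minimal, self-contained sketch of that re-scoring path. The helpers `load_cached_predictions` and `exact_match_accuracy`, the sample data, and the file name are illustrative assumptions; the actual code calls `metaicl_data.evaluate(predictions, groundtruths, is_classification)` on MetaICL's data object, as shown in the diff above.

```python
import os


def load_cached_predictions(prediction_path):
    """Read one prediction per line, mirroring the loop added in test.py."""
    with open(prediction_path, "r") as f:
        return [line.strip() for line in f]


def exact_match_accuracy(predictions, groundtruths):
    """Illustrative stand-in for metaicl_data.evaluate(...): exact-match accuracy."""
    assert len(predictions) == len(groundtruths)
    if not groundtruths:
        return 0.0
    correct = sum(p == g for p, g in zip(predictions, groundtruths))
    return correct / len(groundtruths)


if __name__ == "__main__":
    # Hypothetical dev set; each item carries the gold answer under "output",
    # matching how the patch builds `groundtruths` from `dev_data`.
    dev_data = [{"output": "positive"}, {"output": "negative"}]

    prediction_path = "example-predictions.txt"  # hypothetical cached predictions
    with open(prediction_path, "w") as f:
        f.write("positive\nnegative\n")

    if os.path.exists(prediction_path):
        predictions = load_cached_predictions(prediction_path)
        groundtruths = [dp["output"] for dp in dev_data]
        perf = exact_match_accuracy(predictions, groundtruths)
        print("Accuracy=%s" % perf)
```

The design choice the patch reflects: a pre-existing prediction file is treated as a cache, so a re-run reports the stored performance rather than silently returning 0.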