Merge pull request #6 from tboquet/tasknet-4-5/load-pipeline-improvem…

…ents tasknet-5-6/load-pipeline-improvements
sileod · Sep 19, 2023 · 35a5d36 · 35a5d36
2 parents 2d1c49e + 48d8e1a
commit 35a5d36
Showing 1 changed file with 124 additions and 65 deletions.
diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py
@@ -1,22 +1,32 @@
-from datasets import DatasetDict, Dataset, load_dataset
-from easydict import EasyDict as edict
 import copy
 import functools
-from tqdm.auto import tqdm
-from datasets import concatenate_datasets
+
 import funcy as fc
-import torch
 import magicattr
+import torch
+from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
+from easydict import EasyDict as edict
+from tqdm.auto import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    TextClassificationPipeline,
+)
+
 
 class NoTqdm:
     def __enter__(self):
-        tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True)    
+        tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True)
+
     def __exit__(self, exc_type, exc_value, exc_traceback):
         tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=False)
 
+
 def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0):
     train_testvalid = dataset.train_test_split(test_size=1 - train_ratio, seed=seed)
-    test_valid = train_testvalid["test"].train_test_split(test_size=val_test_ratio, seed=seed)
+    test_valid = train_testvalid["test"].train_test_split(
+        test_size=val_test_ratio, seed=seed
+    )
     dataset = DatasetDict(
         train=train_testvalid["train"],
         validation=test_valid["test"],
@@ -25,32 +35,39 @@ def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, se
     return dataset
 
 
-def load_dataset_sample(*args,n=1000):
-    ds= load_dataset(*args,streaming=True)
-    return DatasetDict({k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds})
+def load_dataset_sample(*args, n=1000):
+    ds = load_dataset(*args, streaming=True)
+    return DatasetDict(
+        {k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds}
+    )
 
 
 def to_dict(x):
-    if hasattr(x,'items'):
+    if hasattr(x, "items"):
         return edict(x)
     else:
-        x=edict({a:getattr(x,a) for a in dir(x) if not a.startswith('__')})
+        x = edict({a: getattr(x, a) for a in dir(x) if not a.startswith("__")})
         return x
 
+
 def deep_copy_cache(function):
-  memo = {}
-  def wrapper(*args, **kwargs):
-    if args in memo:
-      return copy.deepcopy(memo[args])
-    else:
-      rv = function(*args, **kwargs)
-      memo[args] = rv
-      return rv
-  return wrapper
+    memo = {}
+
+    def wrapper(*args, **kwargs):
+        if args in memo:
+            return copy.deepcopy(memo[args])
+        else:
+            rv = function(*args, **kwargs)
+            memo[args] = rv
+            return rv
+
+    return wrapper
+
 
 def shallow_copy_A_to_B(A, B):
     """Shallow copy (=parameter sharing) A into B
-    https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427"""
+    https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427
+    """
 
     def rsetattr(obj, attr, val):
         pre, _, post = attr.rpartition(".")
@@ -66,39 +83,43 @@ def _getattr(obj, attr):
         rsetattr(B, nb, rgetattr(A, na))
     return A, B
 
+
 def normalize_label(label):
-    label=str(label).lower()
-    label=label.replace('-','_')
-    label=label.replace(' ','_')
-    label=label.replace('entailed', 'entailment')
-    label=label.replace('non_','not_')
-    label=label.replace('duplicate','equivalent')
-    label=label.replace('neg','negative')
-    label=label.replace('pos','positive')
+    label = str(label).lower()
+    label = label.replace("-", "_")
+    label = label.replace(" ", "_")
+    label = label.replace("entailed", "entailment")
+    label = label.replace("non_", "not_")
+    label = label.replace("duplicate", "equivalent")
+    label = label.replace("neg", "negative")
+    label = label.replace("pos", "positive")
     return label
 
 
-def merge_tasks(tasks,names):
+def merge_tasks(tasks, names):
     prev, done, to_delete = dict(), dict(), []
-    for i,t in tqdm(enumerate(tasks)):
-        x=[x for x in names if x in t.name]
+    for i, t in tqdm(enumerate(tasks)):
+        x = [x for x in names if x in t.name]
         if x:
-            x=x[0]
-            columns=t.dataset['train'].features.keys()
-            n_choices = len([c for c in columns if 'choice' in c])
+            x = x[0]
+            columns = t.dataset["train"].features.keys()
+            n_choices = len([c for c in columns if "choice" in c])
             if n_choices:
-                x=f"{x}-{n_choices}"
+                x = f"{x}-{n_choices}"
             if x in prev:
-                t.dataset=DatasetDict(fc.merge_with(concatenate_datasets, prev[x], t.dataset))
-            prev[x]=t.dataset
-            t.name=x
-            done[x]=t
-            to_delete+=[i]
-    tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list(done.values())
+                t.dataset = DatasetDict(
+                    fc.merge_with(concatenate_datasets, prev[x], t.dataset)
+                )
+            prev[x] = t.dataset
+            t.name = x
+            done[x] = t
+            to_delete += [i]
+    tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list(
+        done.values()
+    )
     return tasks
 
 
-
 def nested_children(m: torch.nn.Module):
     children = dict(m.named_children())
     output = {}
@@ -107,57 +128,95 @@ def nested_children(m: torch.nn.Module):
     else:
         for name, child in children.items():
             if name.isnumeric():
-                name=f'[{name}]'
+                name = f"[{name}]"
             try:
                 output[name] = nested_children(child)
             except TypeError:
                 output[name] = nested_children(child)
     return output
 
+
 def convert(d):
     for k, v in d.items():
         if isinstance(v, dict):
-            yield from (f'{k}.{x}'.replace('.[','[') for x in convert(v))
+            yield from (f"{k}.{x}".replace(".[", "[") for x in convert(v))
         else:
             yield k
 
-def search_module(m,name, mode='attr', lowercase=True):
+
+def search_module(m, name, mode="attr", lowercase=True):
     paths = convert(nested_children(m))
-    module_name = lambda x: magicattr.get(m,x).__class__.__name__ 
+    module_name = lambda x: magicattr.get(m, x).__class__.__name__
     process = lambda x: x.lower() if lowercase else x
-    name=process(name)
-    if mode=='attr':
+    name = process(name)
+    if mode == "attr":
         return [x for x in paths if name in process(x)]
-    if mode=='class':
+    if mode == "class":
         return [x for x in paths if name in process(module_name(x))]
     else:
         raise ValueError('mode must be "attr" or "class"')
 
 
-def load_pipeline(model_name, task_name, adapt_task_embedding=True,multilingual=False):
-    if multilingual or 'mdeberta' in model_name:
-        multilingual=True 
+def load_pipeline(
+    model_name: str,
+    task_name: str,
+    adapt_task_embedding: bool = True,
+    multilingual: bool = False,
+    device: int = -1,
+    return_all_scores: bool = False,
+) -> TextClassificationPipeline:
+    """Load Text Classification Pipeline for a Specified Model.
+
+    Load a text classification pipeline for the specified model and task. If
+    the model is multilingual or has "mdeberta" in its name, it will handle
+    the multilingual settings. The pipeline will have a model that's adapted
+    to the task using an adapter.
+
+    Args:
+        model_name (str): Name of the model to be loaded.
+        task_name (str): Name of the task for which the pipeline is loaded.
+        adapt_task_embedding (bool, optional): Flag to determine if task
+            embedding should be adapted. Defaults to True.
+        multilingual (bool, optional): Flag to determine if the model is
+            multilingual. Defaults to False.
+        device (int, optional): The device to run the pipeline on (-1 for CPU,
+            >= 0 for GPU ids). Defaults to -1.
+
+    Returns:
+        TextClassificationPipeline: Loaded text classification pipeline.
+
+    """
+    if multilingual or "mdeberta" in model_name:
+        multilingual = True
 
-    from transformers import AutoModelForSequenceClassification, TextClassificationPipeline, AutoTokenizer
     from .models import Adapter
+
     try:
         import tasksource
     except:
-        raise ImportError('Requires tasksource.\n pip install tasksource')
-    task = tasksource.load_task(task_name,multilingual=multilingual)
+        raise ImportError("Requires tasksource.\n pip install tasksource")
+    task = tasksource.load_task(task_name, multilingual=multilingual)
 
-    model = AutoModelForSequenceClassification.from_pretrained(model_name,ignore_mismatched_sizes=True)
-    adapter = Adapter.from_pretrained(model_name.replace('-nli','')+'-adapters')
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name, ignore_mismatched_sizes=True
+    )
+    adapter = Adapter.from_pretrained(model_name.replace("-nli", "") + "-adapters")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = adapter.adapt_model_to_task(model, task_name)
-    model.config.id2label=task['train'].features['labels']._int2str
-    
+    model.config.id2label = task["train"].features["labels"]._int2str
+
     task_index = adapter.config.tasks.index(task_name)
-    
+
     if adapt_task_embedding:
         with torch.no_grad():
-            model.deberta.embeddings.word_embeddings.weight[tokenizer.cls_token_id]+=adapter.Z[task_index]
+            model.deberta.embeddings.word_embeddings.weight[
+                tokenizer.cls_token_id
+            ] += adapter.Z[task_index]
 
     pipe = TextClassificationPipeline(
-    model=model, tokenizer=tokenizer)
-    return pipe
+        model=model,
+        tokenizer=tokenizer,
+        device=device,
+        return_all_scores=return_all_scores,
+    )
+    return pipe