"Program" is now in additional_tokens -> _create_vocabulary() signatu…

…re changed
Natooz · Oct 26, 2021 · bd9aade · bd9aade
1 parent c3d6c89
commit bd9aade
Show file tree

Hide file tree

Showing 13 changed files with 55 additions and 85 deletions.
diff --git a/README.md b/README.md
@@ -190,22 +190,25 @@ These tokens bring additional information about the structure and content of MID
 * **Chords:** indicate the presence of a chord at a certain time step. MidiTok uses a chord detection method based on onset times and duration. This allows MidiTok to detect precisely chords without ambiguity, whereas most chord detection methods in symbolic music based on chroma features can't.
 * **Rests:** include "Rest" events whenever a segment of time is silent, i.e. no note is played within. This token type is decoded as a "Time-Shift" event, meaning the time will be shifted according to its value. You can choose the minimum and maximum rests values to represent (default is 1/2 beat to 8 beats). Note that rests shorter than one beat are only divisible by the first beat resolution, e.g. a rest of 5/8th of a beat will be a succession of ```Rest_0.4``` and ```Rest_0.1```, where the first number indicates the rest duration in beats and the second in samples / positions.
 * **Tempos:** specify the current tempo. This allows you to train a model to predict tempo changes alongside the notes, unless specified in the chart below. Tempo values are quantized on your specified range and number (default is 32 tempos from 40 to 250).
+* **Programs:** used to specify an instrument / MIDI program. MidiTok only offers the possibility to include these tokens in the vocabulary for you, but won't use them. If you need to model multitrack symbolic music with methods other than Octuple / MuMIDI, MidiTok leaves you the choice / task of representing the track information the way you want. You can do it as in [LakhNES](https://github.com/chrisdonahue/LakhNES) or [MMM](https://metacreation.net/mmm-multi-track-music-machine/).
 
 Additionally, MidiTok offers to include *Program* tokens in the vocabulary of MIDI-Like, REMI and CP Word.
 We do not consider them additional tokens though as they are not used anywhere in MidiTok, but intended for you to insert them at the beginning of each sequence as *Start Of Sequence* tokens.
 
-|       | MIDI-Like     | REMI          | Compound Word | Structured | Octuple | MuMIDI        |
-|-------|:-------------:|:--------------:|:--------------:|:----------:|:--------:|:-------------:|
-| Chord | ✅             | ✅             | ✅             | ❌          | ❌       | ✅             |
-| Rest  | ✅             | ✅             | ✅             | ❌          | ❌       | ❌             |
-| Tempo | ✅<sup>1</sup> | ✅<sup>1</sup> | ✅<sup>1</sup> | ❌          | ✅       | ✅<sup>2</sup> |
+|       | MIDI-Like     | REMI          | Compound Word | Structured | Octuple | MuMIDI |
+|-------|:-------------:|:--------------:|:--------------:|:--------:|:-------:|:------:|
+| Chord | ✅             | ✅             | ✅             | ❌        | ❌      | ✅     |
+| Rest  | ✅             | ✅             | ✅             | ❌        | ❌      | ❌     |
+| Tempo | ✅<sup>1</sup> | ✅<sup>1</sup> | ✅<sup>1</sup> | ❌        | ✅      | ✅     |
+| Program | ✅           | ✅             | ✅             | ✅        | ✅<sup>3</sup>| ✅<sup>3</sup>|
 
 <sup>1</sup> Should not be used with multiple tracks. Otherwise, at decoding, only the events of the first track will be considered.\
-<sup>2</sup> Only used in the input as additional information. At decoding no tempo tokens should be predicted, _i.e_ will be considered.
+<sup>2</sup> Only used in the input as additional information. At decoding, no tempo tokens should be predicted, _i.e._ none will be considered.\
+<sup>3</sup> Integrated by default.
 
 ## Limitations
 
-For every tokenization method, MidiTok only support a 4/4 time signature for now.
+For the tokenization methods concerned, MidiTok only considers a 4/4 time signature for now. This means that each bar is considered to cover 4 beats, and each beat has the duration of a quarter note.
 
 Future updates will support other time signatures, and time signature changes for compatible tokenizations.
 

diff --git a/miditok/constants.py b/miditok/constants.py
@@ -10,15 +10,13 @@
 ADDITIONAL_TOKENS = {'Chord': False,
                      'Rest': False,
                      'Tempo': False,
+                     'Program': False,
                      # rest params
                      'rest_range': (2, 8),  # (/min_rest, max_rest_in_BEAT), first divides a whole note/rest
                      # tempo params
                      'nb_tempos': 32,  # nb of tempo bins for additional tempo tokens, quantized like velocities
                      'tempo_range': (40, 250)}  # (min_tempo, max_tempo)
 
-# Used when creating the event <--> token dictionary
-PROGRAM_TOKENS = True  # will include tokens specifying the instrument of each sequence at its beginning
-
 # Defaults values when writing new MIDI files
 TIME_DIVISION = 384  # 384 and 480 are convenient as divisible by 4, 8, 12, 16, 24, 32
 TEMPO = 120

diff --git a/miditok/cp_word.py b/miditok/cp_word.py
@@ -38,22 +38,19 @@ class CPWordEncoding(MIDITokenizer):
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (chords, time signature, rests, tempo...)
-    :param program_tokens: will add entries for MIDI programs in the dictionary, to use
-            in the case of multitrack generation for instance
-    :param sos_eos_tokens: Adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     """
 
     def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, int], int] = BEAT_RES,
                  nb_velocities: int = NB_VELOCITIES, additional_tokens: Dict[str, bool] = ADDITIONAL_TOKENS,
-                 program_tokens: bool = PROGRAM_TOKENS, sos_eos_tokens: bool = False, params=None):
+                 sos_eos_tokens: bool = False, params=None):
         # Indexes of additional token types within a compound token
         self.chord_idx = -3 if additional_tokens['Tempo'] and additional_tokens['Rest'] else -2 if \
             additional_tokens['Tempo'] or additional_tokens['Rest'] else -1
         self.rest_idx = -2 if additional_tokens['Tempo'] else -1
         self.tempo_idx = -1
-        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens,
-                         {'program_tokens': program_tokens, 'sos_eos_tokens': sos_eos_tokens}, params)
+        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens, sos_eos_tokens, params)
 
     def track_to_tokens(self, track: Instrument) -> List[List[int]]:
         """ Converts a track (miditoolkit.Instrument object) into a sequence of tokens
@@ -264,12 +261,11 @@ def tokens_to_track(self, tokens: List[List[int]], time_division: Optional[int]
         tempo_changes[0].time = 0
         return instrument, tempo_changes
 
-    def _create_vocabulary(self, program_tokens: bool, sos_eos_tokens: bool = False) -> Vocabulary:
+    def _create_vocabulary(self, sos_eos_tokens: bool = False) -> Vocabulary:
         """ Creates the Vocabulary object of the tokenizer.
         See the docstring of the Vocabulary class for more details about how to use it.
         NOTE: token index 0 is often used as a padding index during training
 
-        :param program_tokens: will include tokens for MIDI programs
         :param sos_eos_tokens: will include Start Of Sequence (SOS) and End Of Sequence (tokens)
         :return: the vocabulary object
         """
@@ -309,7 +305,7 @@ def _create_vocabulary(self, program_tokens: bool, sos_eos_tokens: bool = False)
             vocab.add_event(f'Tempo_{i}' for i in self.tempos)
 
         # PROGRAM
-        if program_tokens:
+        if self.additional_tokens['Program']:
             vocab.add_event('Family_Program')
             vocab.add_event(f'Program_{program}' for program in range(-1, 128))
 
@@ -327,17 +323,13 @@ def _create_token_types_graph(self) -> Dict[str, List[str]]:
         Position/Chord/Tempo and Pitch/Velocity/Duration
         Here the combination of Pitch, Velocity and Duration tokens is represented by
         "Pitch" in the graph.
+        NOTE: Program type is not referenced here, you can add it manually by
+        modifying the tokens_types_graph class attribute following your strategy.
 
         :return: the token types transitions dictionary
         """
         dic = dict()
 
-        try:
-            _ = self.vocab.tokens_of_type('Program')
-            dic['Program'] = ['Bar']
-        except KeyError:
-            pass
-
         dic['Bar'] = ['Position']
         dic['Position'] = ['Pitch']
         dic['Pitch'] = ['Pitch', 'Bar', 'Position']

diff --git a/miditok/midi_like.py b/miditok/midi_like.py
@@ -33,16 +33,13 @@ class MIDILikeEncoding(MIDITokenizer):
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (chords, time signature, rests, tempo...)
-    :param program_tokens: will add entries for MIDI programs in the dictionary, to use
-            in the case of multitrack generation for instance
-    :param sos_eos_tokens: Adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     """
     def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, int], int] = BEAT_RES,
                  nb_velocities: int = NB_VELOCITIES, additional_tokens: Dict[str, bool] = ADDITIONAL_TOKENS,
-                 program_tokens: bool = PROGRAM_TOKENS, sos_eos_tokens: bool = False, params=None):
-        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens,
-                         {'program_tokens': program_tokens, 'sos_eos_tokens': sos_eos_tokens}, params)
+                 sos_eos_tokens: bool = False, params=None):
+        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens, sos_eos_tokens, params)
 
     def track_to_tokens(self, track: Instrument) -> List[int]:
         """ Converts a track (miditoolkit.Instrument object) into a sequence of tokens
@@ -184,12 +181,11 @@ def tokens_to_track(self, tokens: List[int], time_division: Optional[int] = TIME
         tempo_changes[0].time = 0
         return instrument, tempo_changes
 
-    def _create_vocabulary(self, program_tokens: bool, sos_eos_tokens: bool = False) -> Vocabulary:
+    def _create_vocabulary(self, sos_eos_tokens: bool = False) -> Vocabulary:
         """ Creates the Vocabulary object of the tokenizer.
         See the docstring of the Vocabulary class for more details about how to use it.
         NOTE: token index 0 is often used as a padding index during training
 
-        :param program_tokens: will include tokens for MIDI programs
         :param sos_eos_tokens: will include Start Of Sequence (SOS) and End Of Sequence (tokens)
         :return: the vocabulary object
         """
@@ -221,7 +217,7 @@ def _create_vocabulary(self, program_tokens: bool, sos_eos_tokens: bool = False)
             vocab.add_event(f'Tempo_{i}' for i in self.tempos)
 
         # PROGRAM
-        if program_tokens:
+        if self.additional_tokens['Program']:
             vocab.add_event(f'Program_{program}' for program in range(-1, 128))
 
         # SOS & EOS
@@ -233,17 +229,13 @@ def _create_vocabulary(self, program_tokens: bool, sos_eos_tokens: bool = False)
     def _create_token_types_graph(self) -> Dict[str, List[str]]:
         """ Returns a graph (as a dictionary) of the possible token
         types successions.
+        NOTE: Program type is not referenced here, you can add it manually by
+        modifying the tokens_types_graph class attribute following your strategy.
 
         :return: the token types transitions dictionary
         """
         dic = dict()
 
-        try:
-            _ = self.vocab.tokens_of_type('Program')
-            dic['Program'] = ['Note-On', 'Time-Shift']
-        except KeyError:
-            pass
-
         dic['Note-On'] = ['Velocity']
         dic['Velocity'] = ['Note-On', 'Time-Shift']
         dic['Time-Shift'] = ['Note-Off', 'Note-On']

diff --git a/miditok/midi_tokenizer_base.py b/miditok/midi_tokenizer_base.py
@@ -28,12 +28,12 @@ class MIDITokenizer:
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (chords, rests, tempo...)
-    :param vocab_args: arguments for the _create_vocabulary method
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     """
 
     def __init__(self, pitch_range: range, beat_res: Dict[Tuple[int, int], int], nb_velocities: int,
-                 additional_tokens: Dict[str, Union[bool, int, Tuple[int, int]]], vocab_args: Dict[str, any],
+                 additional_tokens: Dict[str, Union[bool, int, Tuple[int, int]]], sos_eos_tokens: bool = False,
                  params: Union[str, Path, PurePath, Dict[str, Any]] = None):
         # Initialize params
         if params is None:
@@ -68,7 +68,7 @@ def __init__(self, pitch_range: range, beat_res: Dict[Tuple[int, int], int], nb_
             self.rests = self.__create_rests()
 
         # Vocabulary and token types graph
-        self.vocab = self._create_vocabulary(**vocab_args)
+        self.vocab = self._create_vocabulary(sos_eos_tokens)
         self.tokens_types_graph = self._create_token_types_graph()
 
         # Keep in memory durations in ticks for seen time divisions so these values
@@ -401,7 +401,7 @@ def tokenize_midi_dataset(self, midi_paths: Union[List[str], List[Path], List[Pu
         self.save_params(out_dir)  # Saves the parameters with which the MIDIs are converted
 
     @staticmethod
-    def save_tokens(tokens, path: Union[str, Path, PurePath], programs=None):
+    def save_tokens(tokens, path: Union[str, Path, PurePath], programs: List[Tuple[int, bool]] = None):
         """ Saves tokens as a JSON file.
 
         :param tokens: tokens, as any format

diff --git a/miditok/mumidi.py b/miditok/mumidi.py
@@ -31,7 +31,7 @@ class MuMIDIEncoding(MIDITokenizer):
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (chords, time signature, rests, tempo)
-    :param sos_eos_tokens: Adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     :param drum_pitch_range: range of used MIDI pitches for drums exclusively
     """
@@ -42,8 +42,7 @@ def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, i
         self.drum_pitch_range = drum_pitch_range
         # used in place of positional encoding
         self.max_bar_embedding = 60  # this attribute might increase during encoding
-        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens,
-                         {'sos_eos_tokens': sos_eos_tokens}, params)
+        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens, sos_eos_tokens, params)
 
     def save_params(self, out_dir: Union[str, Path, PurePath]):
         """ Override the parent class method to include additional parameter drum pitch range
@@ -373,6 +372,6 @@ def _create_token_types_graph(self) -> Dict[str, List[str]]:
 
         if self.additional_tokens['Chord']:
             dic['Position'] += ['Chord']
-            dic['Chord'] += ['Program']
+            dic['Chord'] = ['Program']
 
         return dic
diff --git a/miditok/octuple.py b/miditok/octuple.py
@@ -26,7 +26,7 @@ class OctupleEncoding(MIDITokenizer):
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (time signature, tempo)
-    :param sos_eos_tokens: Adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     """
     def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, int], int] = BEAT_RES,
@@ -36,8 +36,7 @@ def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, i
         additional_tokens['Rest'] = False
         # used in place of positional encoding
         self.max_bar_embedding = 60  # this attribute might increase during encoding
-        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens,
-                         {'sos_eos_tokens': sos_eos_tokens}, params)
+        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens, sos_eos_tokens, params)
 
     def save_params(self, out_dir: Union[str, Path, PurePath]):
         """ Override the parent class method to include additional parameter drum pitch range

diff --git a/miditok/octuple_mono.py b/miditok/octuple_mono.py
@@ -26,7 +26,7 @@ class OctupleMonoEncoding(MIDITokenizer):
             The values are the resolution, in samples per beat, of the given range, ex 8
     :param nb_velocities: number of velocity bins
     :param additional_tokens: specifies additional tokens (time signature, tempo)
-    :param sos_eos_tokens: Adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
+    :param sos_eos_tokens: adds Start Of Sequence (SOS) and End Of Sequence (EOS) tokens to the vocabulary
     :param params: can be a path to the parameter (json encoded) file or a dictionary
     """
 
@@ -35,10 +35,10 @@ def __init__(self, pitch_range: range = PITCH_RANGE, beat_res: Dict[Tuple[int, i
                  sos_eos_tokens: bool = False, params=None):
         additional_tokens['Chord'] = False  # Incompatible additional token
         additional_tokens['Rest'] = False
+        additional_tokens['Program'] = False
         # used in place of positional encoding
         self.max_bar_embedding = 60  # this attribute might increase during encoding
-        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens,
-                         {'sos_eos_tokens': sos_eos_tokens}, params)
+        super().__init__(pitch_range, beat_res, nb_velocities, additional_tokens, sos_eos_tokens, params)
 
     def save_params(self, out_dir: Union[str, Path, PurePath]):
         """ Override the parent class method to include additional parameter drum pitch range