debeir.training.train_sentence_encoder
from functools import partial
from typing import List, Union

import transformers
from debeir.datasets.types import InputExample, RelevanceExample
from debeir.training.utils import _train_sentence_transformer, tokenize_function
from sentence_transformers.evaluation import SentenceEvaluator
from transformers import AutoModel, SchedulerType, Trainer, TrainingArguments

import datasets


def train_biencoder(model_fp_or_name: str, output_dir: str, train_examples: List[InputExample],
                    dev_examples: List[InputExample], train_batch_size=32, num_epochs=3, warmup_steps=None,
                    evaluate_every_n_step: int = 1000,
                    special_tokens=None, pooling_mode=None, loss_func=None,
                    evaluator: SentenceEvaluator = None, *args, **kwargs):
    """
    Train a universal sentence encoder

    :param model_fp_or_name: The model name or path to the model
    :param output_dir: Output directory to save the model, logs, etc.
    :param train_examples: Training examples
    :param dev_examples: Dev examples
    :param train_batch_size: Training batch size
    :param num_epochs: Number of epochs
    :param warmup_steps: Warmup steps for the scheduler
    :param evaluate_every_n_step: Evaluate the model every n steps
    :param special_tokens: Special tokens to add
    :param pooling_mode: Pooling mode for a sentence transformer model
    :param loss_func: Loss function(s) to use
    :param evaluator: Evaluator to use
    """

    return _train_sentence_transformer(model_fp_or_name, output_dir, train_examples, dev_examples, train_batch_size,
                                       num_epochs, warmup_steps, evaluate_every_n_step, special_tokens,
                                       pooling_mode, loss_func, evaluator)


def train_huggingface_transformer(model_fp_or_name_or_cls: Union[str, transformers.PreTrainedModel],
                                  tokenizer: transformers.PreTrainedTokenizer,
                                  output_dir: str,
                                  compute_metric_fn,
                                  metric: datasets.Metric,
                                  dataset: datasets.DatasetDict = None,
                                  train_dataset: List[Union[RelevanceExample, InputExample, datasets.Dataset]] = None,
                                  eval_dataset: List[Union[RelevanceExample, InputExample, datasets.Dataset]] = None,
                                  train_batch_size=32, num_epochs=3,
                                  learning_rate=5e-5,
                                  lr_scheduler_type: SchedulerType = SchedulerType.CONSTANT_WITH_WARMUP,
                                  optimizer: str = "adamw_hf",
                                  warmup_ratio=0.1, evaluate_every_n_step: int = 1000,
                                  pooling_mode=None, loss_func=None,
                                  model_args=None, model_kwargs=None,
                                  padding_strategy="max_length",
                                  truncate=True,
                                  special_tokens=None,
                                  seed=42,
                                  *args, **kwargs) -> Trainer:
    """
    Train a transformer model using the Huggingface API

    :param model_fp_or_name_or_cls: Model name or model class to instantiate
    :param tokenizer: Tokenizer
    :param output_dir: Output directory to write to
    :param compute_metric_fn: Metric function to compute metrics
    :param metric: Metric used by the compute_metric_fn
    :param dataset: Huggingface Dataset Dict
    :param train_dataset: Training dataset to be used by the Trainer class
    :param eval_dataset: Evaluation dataset to be used by the Trainer class
    :param train_batch_size: Batch size to use for training
    :param num_epochs: Number of training epochs (default: 3)
    :param learning_rate: Learning rate (default: 5e-5)
    :param lr_scheduler_type: Learning rate schedule type; see SchedulerType
    :param optimizer: Optimizer
    :param warmup_ratio: Warmup as a ratio of total training steps (default: 0.1)
    :param evaluate_every_n_step: Evaluate the model every n training steps
    :param pooling_mode: Pooling mode for your model
    :param loss_func: Loss function to instantiate the model with
    :param model_args: Model arguments to pass
    :param model_kwargs: Model keyword arguments
    :param padding_strategy: Tokenization padding strategy
    :param truncate: Whether to truncate during tokenization
    :param special_tokens: Special tokens to add to the tokenizer
    :param seed: Dataset shuffle seed
    :param args: Additional positional arguments (unused)
    :param kwargs: Additional keyword arguments (unused)
    :return: The Trainer instance after training
    """

    if isinstance(model_fp_or_name_or_cls, str):
        # A model name or path: load pretrained weights
        model = AutoModel.from_pretrained(model_fp_or_name_or_cls)
    elif isinstance(model_fp_or_name_or_cls, type):
        # A model class: instantiate it with the supplied arguments
        model = model_fp_or_name_or_cls(*(model_args or ()),
                                        loss_func=loss_func,
                                        pooling_mode=pooling_mode,
                                        **(model_kwargs or {}))
    else:
        # Already an instantiated model
        model = model_fp_or_name_or_cls

    if special_tokens:
        tokenizer.add_tokens(special_tokens, special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

    if dataset:
        tokenized_datasets = dataset.map(
            partial(
                tokenize_function, tokenizer,
                padding_strategy=padding_strategy,
                truncate=truncate
            ), batched=True)

        train_dataset = tokenized_datasets["train"].shuffle(seed=seed)
        eval_dataset = tokenized_datasets["dev"].shuffle(seed=seed)

    training_args = TrainingArguments(output_dir=output_dir,
                                      per_device_train_batch_size=train_batch_size,
                                      num_train_epochs=num_epochs,
                                      warmup_ratio=warmup_ratio,
                                      evaluation_strategy="steps",
                                      eval_steps=evaluate_every_n_step,
                                      learning_rate=learning_rate,
                                      lr_scheduler_type=lr_scheduler_type,
                                      optim=optimizer,
                                      prediction_loss_only=metric is None and compute_metric_fn is None,
                                      fp16=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=partial(compute_metric_fn, metric) if compute_metric_fn else None,
    )

    trainer.train()

    return trainer
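The `tokenize_function` helper imported above lives in `debeir.training.utils` and is not shown in this listing. Judging only from how it is invoked (bound to the tokenizer with `functools.partial` and mapped over the `DatasetDict` with `batched=True`), a compatible implementation would look roughly like the sketch below; the `"text"` column name and the exact signature are assumptions, not the actual debeir code.

```python
# Hypothetical stand-in for debeir.training.utils.tokenize_function,
# inferred from the call site above; the real helper may differ.
def tokenize_function(tokenizer, examples, padding_strategy="max_length", truncate=True):
    # `examples` is a batch of columns supplied by datasets.Dataset.map(batched=True).
    # The "text" column name is an assumption about the dataset schema.
    return tokenizer(examples["text"], padding=padding_strategy, truncation=truncate)
```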
def train_biencoder(
    model_fp_or_name: str,
    output_dir: str,
    train_examples: List[debeir.datasets.types.InputExample],
    dev_examples: List[debeir.datasets.types.InputExample],
    train_batch_size=32,
    num_epochs=3,
    warmup_steps=None,
    evaluate_every_n_step: int = 1000,
    special_tokens=None,
    pooling_mode=None,
    loss_func=None,
    evaluator: sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator = None,
    *args,
    **kwargs
):
Train a universal sentence encoder. A usage sketch follows the parameter list below.
Parameters
- model_fp_or_name: The model name or path to the model
- output_dir: Output directory to save the model, logs, etc.
- train_examples: Training examples
- dev_examples: Dev examples
- train_batch_size: Training batch size
- num_epochs: Number of epochs
- warmup_steps: Warmup steps for the scheduler
- evaluate_every_n_step: Evaluate the model every n steps
- special_tokens: Special tokens to add
- pooling_mode: Pooling mode for a sentence transformer model
- loss_func: Loss function(s) to use
- evaluator: Evaluator to use
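A minimal call might look like the sketch below. It is illustrative only: the checkpoint name is an arbitrary example, and the `InputExample` constructor is assumed to follow the sentence-transformers convention of a list of texts plus a float label, which this page does not confirm.

```python
# Sketch only. Assumes debeir's InputExample mirrors the sentence-transformers
# convention (texts=[...], label=float); check debeir.datasets.types for the
# actual constructor.
from debeir.datasets.types import InputExample
from debeir.training.train_sentence_encoder import train_biencoder

train_examples = [
    InputExample(texts=["what causes rust?", "Rust forms when iron oxidises."], label=1.0),
    InputExample(texts=["what causes rust?", "Pasta is made from durum wheat."], label=0.0),
]
dev_examples = [
    InputExample(texts=["how do magnets work?", "Magnetism arises from moving charges."], label=1.0),
]

train_biencoder(
    model_fp_or_name="sentence-transformers/all-MiniLM-L6-v2",  # any SBERT-compatible checkpoint
    output_dir="./output/biencoder",
    train_examples=train_examples,
    dev_examples=dev_examples,
    train_batch_size=32,
    num_epochs=1,
    evaluate_every_n_step=500,
)
```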
def train_huggingface_transformer(
    model_fp_or_name_or_cls: Union[str, transformers.modeling_utils.PreTrainedModel],
    tokenizer: transformers.tokenization_utils.PreTrainedTokenizer,
    output_dir: str,
    compute_metric_fn,
    metric: datasets.metric.Metric,
    dataset: datasets.dataset_dict.DatasetDict = None,
    train_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,
    eval_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,
    train_batch_size=32,
    num_epochs=3,
    learning_rate=5e-05,
    lr_scheduler_type: transformers.trainer_utils.SchedulerType = SchedulerType.CONSTANT_WITH_WARMUP,
    optimizer: str = 'adamw_hf',
    warmup_ratio=0.1,
    evaluate_every_n_step: int = 1000,
    pooling_mode=None,
    loss_func=None,
    model_args=None,
    model_kwargs=None,
    padding_strategy='max_length',
    truncate=True,
    special_tokens=None,
    seed=42,
    *args,
    **kwargs
) -> transformers.trainer.Trainer:
Train a transformer model using the Huggingface API. A usage sketch follows the parameter list below.
Parameters
- model_fp_or_name_or_cls: Model name or model class to instantiate
- tokenizer: Tokenizer
- output_dir: Output directory to write to
- compute_metric_fn: Metric function to compute metrics
- metric: Metric used by the compute_metric_fn
- dataset: Huggingface Dataset Dict
- train_dataset: Training dataset to be used by the Trainer class
- eval_dataset: Evaluation dataset to be used by the Trainer class
- train_batch_size: Batch size to use for training
- num_epochs: Number of training epochs (default: 3)
- learning_rate: Learning rate (default: 5e-5)
- lr_scheduler_type: Learning rate schedule type; see SchedulerType
- optimizer: Optimizer
- warmup_ratio: Warmup as a ratio of total training steps (default: 0.1)
- evaluate_every_n_step: Evaluate the model every n training steps
- pooling_mode: Pooling mode for your model
- loss_func: Loss function to instantiate the model with
- model_args: Model arguments to pass
- model_kwargs: Model keyword arguments
- padding_strategy: Tokenization padding strategy
- truncate: Whether to truncate during tokenization
- special_tokens: Special tokens to add to the tokenizer
- seed: Dataset shuffle seed
- args: Additional positional arguments (unused)
- kwargs: Additional keyword arguments (unused)

Returns
- The Trainer instance after training
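The sketch below wires the pieces together: a `compute_metric_fn` with the signature the function binds via `functools.partial` (the metric first, then the Trainer's `EvalPrediction`), plus a call that passes a `DatasetDict`. It is a rough, assumption-laden example: the checkpoint name and the toy `"text"`/`"label"` columns are placeholders (whatever `tokenize_function` actually expects is not shown on this page), `datasets.load_metric` matches the `datasets.Metric` annotation but is deprecated in newer `datasets` releases in favour of the `evaluate` package, the argmax assumes classification-style outputs, and a CUDA device is assumed because the function enables fp16.

```python
# Sketch only; names, columns, and the metric-loading call are assumptions.
import datasets
import numpy as np
import transformers

from debeir.training.train_sentence_encoder import train_huggingface_transformer


def compute_metric_fn(metric, eval_pred):
    # Bound via functools.partial inside train_huggingface_transformer, so the
    # metric arrives first and the Trainer's EvalPrediction second.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # assumes classification-style logits
    return metric.compute(predictions=predictions, references=labels)


tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
metric = datasets.load_metric("accuracy")  # newer datasets versions: use evaluate.load

# Toy DatasetDict with "train" and "dev" splits; the column names are guesses
# at what debeir.training.utils.tokenize_function expects.
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({"text": ["a query", "a passage"], "label": [0, 1]}),
    "dev": datasets.Dataset.from_dict({"text": ["another query"], "label": [1]}),
})

trainer = train_huggingface_transformer(
    model_fp_or_name_or_cls="bert-base-uncased",
    tokenizer=tokenizer,
    output_dir="./output/transformer",
    compute_metric_fn=compute_metric_fn,
    metric=metric,
    dataset=dataset,
    train_batch_size=16,
    num_epochs=1,
)
```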