From c5cb34aae41505af677e5ff3a1c0276ddcdf755d Mon Sep 17 00:00:00 2001 From: stephantul Date: Thu, 30 Apr 2026 09:50:28 +0200 Subject: [PATCH] chore: fix pre-commit --- .pre-commit-config.yaml | 41 ++++++++++++++-------------- Makefile | 10 ++++--- model2vec/distill/distillation.py | 13 ++++----- model2vec/distill/inference.py | 21 +++++--------- model2vec/distill/utils.py | 3 +- model2vec/inference/model.py | 24 ++++++---------- model2vec/model.py | 34 ++++++++--------------- model2vec/modelcards/modelcards.py | 3 +- model2vec/persistence/hf.py | 6 ++-- model2vec/persistence/persistence.py | 6 ++-- model2vec/quantization.py | 6 ++-- model2vec/tokenizer/tokenizer.py | 6 ++-- model2vec/train/base.py | 12 +++----- model2vec/train/classifier.py | 15 ++++------ model2vec/train/dataset.py | 3 +- model2vec/train/similarity.py | 3 +- model2vec/train/utils.py | 3 +- model2vec/utils.py | 15 +--------- pyproject.toml | 8 ++---- scripts/export_to_onnx.py | 13 +++------ 20 files changed, 89 insertions(+), 156 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0557041..714daf6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,36 +1,37 @@ -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks +minimum_pre_commit_version: "3.2.0" + +default_language_version: + python: python3 + repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: + - id: no-commit-to-branch + args: [--branch, main] - id: check-ast - description: Simply check whether files parse as valid python. + - id: check-yaml + - id: check-toml + - id: check-json + - id: check-merge-conflict + - id: debug-statements - id: trailing-whitespace - description: Trims trailing whitespace - id: end-of-file-fixer - description: Makes sure files end in a newline and only a newline. - id: check-added-large-files - args: ['--maxkb=5000'] - description: Prevent giant files from being committed. + args: ["--maxkb=5000"] - id: check-case-conflict - description: Check for files with names that would conflict on case-insensitive filesystems like MacOS/Windows. - - id: check-yaml - description: Check yaml files for syntax errors. - repo: https://github.com/jsh9/pydoclint - rev: 0.5.3 + rev: 0.8.3 hooks: - id: pydoclint - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.13.0 + rev: v0.15.12 hooks: - - id: ruff - args: [ --fix ] + - id: ruff-check + args: [--fix] - id: ruff-format - - repo: local + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.15.0 hooks: - id: mypy - name: mypy - entry: mypy - language: python - types: [python] + additional_dependencies: [] diff --git a/Makefile b/Makefile index ee08f25..276817e 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ -clean: - +VERBOSITY= venv: uv venv @@ -9,7 +8,7 @@ install: uv run pre-commit install install-no-pre-commit: - uv pip install ".[dev,distill,inference,train]" + uv pip install ".[dev,distill,inference,train,onnx,quantization]" install-base: uv sync --extra dev @@ -18,4 +17,7 @@ fix: uv run pre-commit run --all-files test: - uv run pytest --cov=model2vec --cov-report=term-missing + uv run pytest --cov=model2vec --cov-report=term-missing $(VERBOSITY) + +test-verbose: + make test VERBOSITY="-vvv" diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py index 589d791..7a29553 100644 --- a/model2vec/distill/distillation.py +++ b/model2vec/distill/distillation.py @@ -33,8 +33,7 @@ def distill_from_model( vocabulary_quantization: int | None = None, pooling: PoolingMode | str = PoolingMode.MEAN, ) -> StaticModel: - """ - Distill a staticmodel from a sentence transformer. + """Distill a staticmodel from a sentence transformer. This function creates a set of embeddings from a sentence transformer. It does this by doing either a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed @@ -65,7 +64,7 @@ def distill_from_model( 'first': use the first token's hidden state ([CLS] token in BERT-style models). 'pooler': use the pooler output (if available). This is often a non-linear projection of the [CLS] token. :return: A StaticModel. - :raises: ValueError if the vocabulary is empty after preprocessing. + :raises ValueError: if the vocabulary is empty after preprocessing. """ quantize_to = DType(quantize_to) @@ -168,15 +167,14 @@ def _validate_parameters( sif_coefficient: float | None, token_remove_pattern: str | None, ) -> tuple[float | None, re.Pattern[str] | None]: - """ - Validate the parameters passed to the distillation function. + """Validate the parameters passed to the distillation function. :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied. Should be a value >= 0 and < 1.0. A value of 1e-4 is a good default. :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary. :return: The SIF coefficient to use. - :raises: ValueError if the regex can't be compiled. + :raises ValueError: if the regex can't be compiled. """ if sif_coefficient is not None: @@ -205,8 +203,7 @@ def distill( vocabulary_quantization: int | None = None, pooling: PoolingMode | str = PoolingMode.MEAN, ) -> StaticModel: - """ - Distill a staticmodel from a sentence transformer. + """Distill a staticmodel from a sentence transformer. This function creates a set of embeddings from a sentence transformer. It does this by doing either a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed diff --git a/model2vec/distill/inference.py b/model2vec/distill/inference.py index a941dda..942efab 100644 --- a/model2vec/distill/inference.py +++ b/model2vec/distill/inference.py @@ -23,8 +23,7 @@ class PoolingMode(str, Enum): - """ - Pooling modes for embedding creation. + """Pooling modes for embedding creation. - MEAN: masked mean over all tokens. - LAST: last non-padding token (often EOS, common in decoder-style models). @@ -48,8 +47,7 @@ def create_embeddings( pad_token_id: int, pooling: PoolingMode | str = PoolingMode.MEAN, ) -> np.ndarray: - """ - Create output embeddings for a bunch of tokens using a pretrained model. + """Create output embeddings for a bunch of tokens using a pretrained model. It does a forward pass for all tokens passed in `tokens`. @@ -121,8 +119,7 @@ def create_embeddings( def _encode_with_model( model: PreTrainedModel, encodings: dict[str, torch.Tensor] ) -> tuple[torch.Tensor, torch.Tensor | None, dict[str, torch.Tensor]]: - """ - Move inputs to the model device, run a forward pass, and standardize dtypes. + """Move inputs to the model device, run a forward pass, and standardize dtypes. :param model: The model to use. :param encodings: The encoded tokens to turn into features. @@ -146,8 +143,7 @@ def _encode_with_model( @torch.inference_mode() def _encode_mean_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor: - """ - Encode a batch of tokens using mean pooling. + """Encode a batch of tokens using mean pooling. :param model: The model to use. :param encodings: The encoded tokens to turn into features. @@ -163,8 +159,7 @@ def _encode_mean_with_model(model: PreTrainedModel, encodings: dict[str, torch.T @torch.inference_mode() def _encode_last_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor: - """ - Encode a batch of tokens using last token pooling. + """Encode a batch of tokens using last token pooling. :param model: The model to use. :param encodings: The encoded tokens to turn into features. @@ -179,8 +174,7 @@ def _encode_last_with_model(model: PreTrainedModel, encodings: dict[str, torch.T @torch.inference_mode() def _encode_first_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor: - """ - Encode a batch of tokens using first token (CLS) pooling. + """Encode a batch of tokens using first token (CLS) pooling. :param model: The model to use. :param encodings: The encoded tokens to turn into features. @@ -192,8 +186,7 @@ def _encode_first_with_model(model: PreTrainedModel, encodings: dict[str, torch. @torch.inference_mode() def _encode_pooler_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor: - """ - Encode a batch of tokens using pooler output. + """Encode a batch of tokens using pooler output. :param model: The model to use. :param encodings: The encoded tokens to turn into features. diff --git a/model2vec/distill/utils.py b/model2vec/distill/utils.py index 11ad6a5..2890495 100644 --- a/model2vec/distill/utils.py +++ b/model2vec/distill/utils.py @@ -9,8 +9,7 @@ def select_optimal_device(device: str | None) -> str: - """ - Get the optimal device to use based on backend availability. + """Get the optimal device to use based on backend availability. For Torch versions >= 2.8.0, MPS is disabled due to known performance regressions. diff --git a/model2vec/inference/model.py b/model2vec/inference/model.py index 81164ba..5a56dd9 100644 --- a/model2vec/inference/model.py +++ b/model2vec/inference/model.py @@ -52,8 +52,7 @@ def __init__(self, model: StaticModel, head: Pipeline) -> None: def from_pretrained( cls: type[StaticModelPipeline], path: PathLike, token: str | None = None, trust_remote_code: bool = False ) -> StaticModelPipeline: - """ - Load a StaticModel from a local path or huggingface hub path. + """Load a StaticModel from a local path or huggingface hub path. NOTE: if you load a private model from the huggingface hub, you need to pass a token. @@ -74,8 +73,7 @@ def save_pretrained(self, path: str) -> None: def push_to_hub( self, repo_id: str, subfolder: str | None = None, token: str | None = None, private: bool = False ) -> None: - """ - Save a model to a folder, and then push that folder to the hf hub. + """Save a model to a folder, and then push that folder to the hf hub. :param repo_id: The id of the repository to push to. :param subfolder: The subfolder to push to. @@ -122,8 +120,7 @@ def predict( multiprocessing_threshold: int = 10_000, threshold: float = 0.5, ) -> np.ndarray: - """ - Predict the labels of the input. + """Predict the labels of the input. :param X: The input data to predict. Can be a list of strings or a single string. :param show_progress_bar: Whether to display a progress bar during prediction. Defaults to False. @@ -162,8 +159,7 @@ def predict_proba( use_multiprocessing: bool = True, multiprocessing_threshold: int = 10_000, ) -> np.ndarray: - """ - Predict the labels of the input. + """Predict the labels of the input. :param X: The input data to predict. Can be a list of strings or a single string. :param show_progress_bar: Whether to display a progress bar during prediction. Defaults to False. @@ -190,8 +186,7 @@ def predict_proba( def evaluate( self, X: Sequence[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False ) -> str | dict[str, dict[str, float]]: - """ - Evaluate the classifier on a given dataset using scikit-learn's classification report. + """Evaluate the classifier on a given dataset using scikit-learn's classification report. :param X: The texts to predict on. :param y: The ground truth labels. @@ -212,8 +207,7 @@ def evaluate( def _load_pipeline( folder_or_repo_path: PathLike, token: str | None = None, trust_remote_code: bool = False ) -> tuple[StaticModel, Pipeline]: - """ - Load a model and an sklearn pipeline. + """Load a model and an sklearn pipeline. This assumes the following files are present in the repo: - `pipeline.skops`: The head of the pipeline. @@ -259,8 +253,7 @@ def _load_pipeline( def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> None: - """ - Save a pipeline to a folder. + """Save a pipeline to a folder. :param pipeline: The pipeline to save. :param folder_path: The path to the folder to save the pipeline to. @@ -296,8 +289,7 @@ def evaluate_single_or_multi_label( y: list[int] | list[str] | list[list[int]] | list[list[str]], output_dict: bool = False, ) -> str | dict[str, dict[str, float]]: - """ - Evaluate the classifier on a given dataset using scikit-learn's classification report. + """Evaluate the classifier on a given dataset using scikit-learn's classification report. :param predictions: The predictions. :param y: The ground truth labels. diff --git a/model2vec/model.py b/model2vec/model.py index 2680af4..e477c3c 100644 --- a/model2vec/model.py +++ b/model2vec/model.py @@ -34,8 +34,7 @@ def __init__( weights: np.ndarray | None = None, token_mapping: np.ndarray | None = None, ) -> None: - """ - Initialize the StaticModel. + """Initialize the StaticModel. :param vectors: The vectors to use. :param tokenizer: The Transformers tokenizer to use. @@ -49,7 +48,7 @@ def __init__( :param token_mapping: A mapping from token ids to indices in the vectors. If None, we don't remap the tokens during inference. This is only used for models that have undergone vocabulary quantization. - :raises: ValueError if the number of tokens does not match the number of vectors. + :raises ValueError: if the number of tokens does not match the number of vectors. """ super().__init__() tokens, _ = zip(*sorted(tokenizer.get_vocab().items(), key=lambda x: x[1])) @@ -95,8 +94,7 @@ def dim(self) -> int: @property def normalize(self) -> bool: - """ - Get the normalize value. + """Get the normalize value. :return: The normalize value. """ @@ -125,8 +123,7 @@ def vocabulary_quantization(self) -> int | None: return int(self.embedding.shape[0]) if is_quantized else None def save_pretrained(self, path: PathLike, model_name: str | None = None, subfolder: str | None = None) -> None: - """ - Save the pretrained model. + """Save the pretrained model. :param path: The path to save to. :param model_name: The model name to use in the Model Card. @@ -148,8 +145,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None, subfold ) def tokenize(self, sentences: Sequence[str], max_length: int | None = None) -> list[list[int]]: - """ - Tokenize a list of sentences. + """Tokenize a list of sentences. :param sentences: The sentences to tokenize. :param max_length: The maximum length of the sentences in tokens. If this is None, sequences @@ -189,8 +185,7 @@ def from_pretrained( vocabulary_quantization: int | None = None, force_download: bool = True, ) -> StaticModel: - """ - Load a StaticModel from a local path or huggingface hub path. + """Load a StaticModel from a local path or huggingface hub path. NOTE: if you load a private model from the huggingface hub, you need to pass a token. @@ -280,8 +275,7 @@ def encode_as_sequence( use_multiprocessing: bool = True, multiprocessing_threshold: int = 10_000, ) -> list[np.ndarray] | np.ndarray: - """ - Encode a list of sentences as a list of numpy arrays of tokens. + """Encode a list of sentences as a list of numpy arrays of tokens. This is useful if you want to use the tokens for further processing, or if you want to do sequence modeling. @@ -357,8 +351,7 @@ def encode( multiprocessing_threshold: int = 10_000, **kwargs: Any, ) -> np.ndarray: - """ - Encode a list of sentences. + """Encode a list of sentences. This function encodes a list of sentences by averaging the word embeddings of the tokens in the sentence. For ease of use, we don't batch sentences together. @@ -411,8 +404,7 @@ def encode( return out_array def _encode_helper(self, id_list: list[int]) -> np.ndarray: - """ - Helper function to encode a list of ids. + """Helper function to encode a list of ids. This function is used to deduplicate the logic in `encode` and `encode_as_sequence`. It retrieves the embeddings for the given list of ids, applying weights if available. @@ -457,8 +449,7 @@ def _batch(sentences: Sequence[str], batch_size: int) -> Iterator[Sequence[str]] def push_to_hub( self, repo_id: str, private: bool = False, token: str | None = None, subfolder: str | None = None ) -> None: - """ - Push the model to the huggingface hub. + """Push the model to the huggingface hub. NOTE: you need to pass a token if you are pushing a private model. @@ -481,8 +472,7 @@ def quantize_model( quantize_to: str | DType | None = None, dimensionality: int | None = None, ) -> StaticModel: - """ - Quantize the model to a lower precision and possibly lower dimensionality. + """Quantize the model to a lower precision and possibly lower dimensionality. :param model: The model to quantize. :param vocabulary_quantization: The number of clusters to use for quantization. @@ -490,7 +480,7 @@ def quantize_model( :param dimensionality: The desired dimensionality of the model. This needs to be < than the current model dimensionality. :return: A new StaticModel with the quantized embeddings. - :raises: ValueError if the model is already quantized. + :raises ValueError: if the model is already quantized. """ token_mapping: np.ndarray | None weights: np.ndarray | None diff --git a/model2vec/modelcards/modelcards.py b/model2vec/modelcards/modelcards.py index 3e9e570..06e2a73 100644 --- a/model2vec/modelcards/modelcards.py +++ b/model2vec/modelcards/modelcards.py @@ -16,8 +16,7 @@ def create_model_card( template_path: str = "model_card_template.md", **kwargs: Any, ) -> None: - """ - Create a model card and store it in the specified path. + """Create a model card and store it in the specified path. :param folder_path: The path where the model card will be stored. :param base_model_name: The name of the base model. diff --git a/model2vec/persistence/hf.py b/model2vec/persistence/hf.py index c80e86d..c55b0af 100644 --- a/model2vec/persistence/hf.py +++ b/model2vec/persistence/hf.py @@ -10,8 +10,7 @@ def push_folder_to_hub( folder_path: Path, subfolder: str | None, repo_id: str, private: bool, token: str | None ) -> None: - """ - Push a model folder to the huggingface hub, including model card. + """Push a model folder to the huggingface hub, including model card. :param folder_path: The path to the folder. :param subfolder: The subfolder to push to. @@ -30,8 +29,7 @@ def push_folder_to_hub( def maybe_get_cached_model_path(model_id: str) -> Path | None: - """ - Gets the latest model path for a given identifier from the hugging face hub cache. + """Gets the latest model path for a given identifier from the hugging face hub cache. Returns None if there is no cached model. In this case, the model will be downloaded. """ diff --git a/model2vec/persistence/persistence.py b/model2vec/persistence/persistence.py index e2ffdba..bcbeb76 100644 --- a/model2vec/persistence/persistence.py +++ b/model2vec/persistence/persistence.py @@ -31,8 +31,7 @@ def save_pretrained( mapping: np.ndarray | None = None, **kwargs: Any, ) -> None: - """ - Save a model to a folder. + """Save a model to a folder. :param folder_path: The path to the folder. :param embeddings: The embeddings. @@ -85,8 +84,7 @@ def load_pretrained( token: str | None, force_download: bool, ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any], np.ndarray | None, np.ndarray | None]: - """ - Loads a pretrained model from a folder. + """Loads a pretrained model from a folder. :param folder_or_repo_path: The folder or repo path to load from. - If this is a local path, we will load from the local path. diff --git a/model2vec/quantization.py b/model2vec/quantization.py index fd0c687..f06bcd6 100644 --- a/model2vec/quantization.py +++ b/model2vec/quantization.py @@ -24,8 +24,7 @@ class DType(str, Enum): def quantize_embeddings(embeddings: np.ndarray, quantize_to: DType) -> np.ndarray: - """ - Quantize embeddings to a specified data type to reduce memory usage. + """Quantize embeddings to a specified data type to reduce memory usage. :param embeddings: The embeddings to quantize, as a numpy array. :param quantize_to: The data type to quantize to. @@ -62,8 +61,7 @@ def quantize_embeddings(embeddings: np.ndarray, quantize_to: DType) -> np.ndarra def quantize_and_reduce_dim( embeddings: np.ndarray, quantize_to: str | DType | None, dimensionality: int | None ) -> np.ndarray: - """ - Quantize embeddings to a datatype and reduce dimensionality. + """Quantize embeddings to a datatype and reduce dimensionality. :param embeddings: The embeddings to quantize and reduce, as a numpy array. :param quantize_to: The data type to quantize to. If None, no quantization is performed. diff --git a/model2vec/tokenizer/tokenizer.py b/model2vec/tokenizer/tokenizer.py index d78bb10..c939a3b 100644 --- a/model2vec/tokenizer/tokenizer.py +++ b/model2vec/tokenizer/tokenizer.py @@ -13,8 +13,7 @@ def clean_and_create_vocabulary( vocabulary_to_add: list[str], token_remove_regex: re.Pattern[str] | None, ) -> TokenizerModel: - """ - Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary. + """Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary. This function removes duplicate tokens and tokens that are already in the model's vocabulary. It also removes the tokenizer's post-processor, which we do not use anyway. @@ -94,8 +93,7 @@ def _report_statistics(n_multiword: int, n_duplicate: int, n_regex_removed: int, def turn_tokens_into_ids(tokens: list[str], model: TokenizerModel) -> list[list[int]]: - """ - Convert a list of Token objects to their corresponding token ID sequences. + """Convert a list of Token objects to their corresponding token ID sequences. :param tokens: List of Token objects to convert :param model: The tokenizermodel of the tokenizer. diff --git a/model2vec/train/base.py b/model2vec/train/base.py index 8ce093e..34dbdf7 100644 --- a/model2vec/train/base.py +++ b/model2vec/train/base.py @@ -47,8 +47,7 @@ def __init__( freeze: bool = False, normalize: bool = True, ) -> None: - """ - Initialize a trainable StaticModel from a StaticModel. + """Initialize a trainable StaticModel from a StaticModel. :param vectors: The embeddings of the staticmodel. :param tokenizer: The tokenizer. @@ -172,8 +171,7 @@ def from_static_model( ) def _encode(self, input_ids: torch.Tensor) -> torch.Tensor: - """ - A forward pass and mean pooling. + """A forward pass and mean pooling. This function is analogous to `StaticModel.encode`, but reimplemented to allow gradients to pass through. @@ -218,8 +216,7 @@ def forward(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: return self.head(encoded), encoded def tokenize(self, texts: list[str], max_length: int | None = 512) -> torch.Tensor: - """ - Tokenize a bunch of strings into a single padded 2D tensor. + """Tokenize a bunch of strings into a single padded 2D tensor. Note that this is not used during training. @@ -368,8 +365,7 @@ def _determine_val_check_interval( return val_check_interval, check_val_every_epoch def _prepare_dataset(self, X: list[str], y: torch.Tensor, max_length: int = 512) -> TextDataset: - """ - Prepare a dataset. + """Prepare a dataset. :param X: The texts. :param y: The labels. diff --git a/model2vec/train/classifier.py b/model2vec/train/classifier.py index e12d385..acf102b 100644 --- a/model2vec/train/classifier.py +++ b/model2vec/train/classifier.py @@ -65,8 +65,7 @@ def classes(self) -> np.ndarray: def predict( self, X: list[str], show_progress_bar: bool = False, batch_size: int = 1024, threshold: float = 0.5 ) -> np.ndarray: - """ - Predict labels for a set of texts. + """Predict labels for a set of texts. In single-label mode, each prediction is a single class. In multilabel mode, each prediction is a list of classes. @@ -93,8 +92,7 @@ def predict( return np.array(pred) def predict_proba(self, X: list[str], show_progress_bar: bool = False, batch_size: int = 1024) -> np.ndarray: - """ - Predict probabilities for each class. + """Predict probabilities for each class. In single-label mode, returns softmax probabilities. In multilabel mode, returns sigmoid probabilities. @@ -125,8 +123,7 @@ def fit( validation_steps: int | None = None, random_seed: int = _DEFAULT_RANDOM_SEED, ) -> StaticModelForClassification: - """ - Fit a model. + """Fit a model. This function creates a Lightning Trainer object and fits the model to the data. It supports both single-label and multi-label classification. @@ -222,8 +219,7 @@ def _determine_class_weight( def evaluate( self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False ) -> str | dict[str, dict[str, float]]: - """ - Evaluate the classifier on a given dataset using scikit-learn's classification report. + """Evaluate the classifier on a given dataset using scikit-learn's classification report. :param X: The texts to predict on. :param y: The ground truth labels. @@ -239,8 +235,7 @@ def evaluate( return report def _initialize_on_labels(self, y: LabelType) -> None: - """ - Sets the output dimensionality, the classes, and initializes the head. + """Sets the output dimensionality, the classes, and initializes the head. :param y: The labels. :raises ValueError: If the labels are inconsistent. diff --git a/model2vec/train/dataset.py b/model2vec/train/dataset.py index fc608c5..7cf0684 100644 --- a/model2vec/train/dataset.py +++ b/model2vec/train/dataset.py @@ -5,8 +5,7 @@ class TextDataset(Dataset): def __init__(self, tokenized_texts: list[list[int]], targets: torch.Tensor) -> None: - """ - A dataset of texts. + """A dataset of texts. :param tokenized_texts: The tokenized texts. Each text is a list of token ids. :param targets: The targets. diff --git a/model2vec/train/similarity.py b/model2vec/train/similarity.py index 227eb4f..6f1b049 100644 --- a/model2vec/train/similarity.py +++ b/model2vec/train/similarity.py @@ -61,8 +61,7 @@ def fit( validation_steps: int | None = None, random_seed: int = _DEFAULT_RANDOM_SEED, ) -> StaticModelForSimilarity: - """ - Fit a model. + """Fit a model. This function creates a Lightning Trainer object and fits the model to the data. We use early stopping. After training, the weights of the best model are loaded back into the model. diff --git a/model2vec/train/utils.py b/model2vec/train/utils.py index 683c7ff..91f5a85 100644 --- a/model2vec/train/utils.py +++ b/model2vec/train/utils.py @@ -77,8 +77,7 @@ def train_test_split( y: list, test_size: float, ) -> tuple[list[str], list[str], list, list]: - """ - Split the data. + """Split the data. For single-label classification, stratification is attempted (if possible). For multilabel classification, a random split is performed. diff --git a/model2vec/utils.py b/model2vec/utils.py index 3776302..9c9ec66 100644 --- a/model2vec/utils.py +++ b/model2vec/utils.py @@ -18,8 +18,7 @@ class ProgressParallel(Parallel): """A drop-in replacement for joblib.Parallel that shows a tqdm progress bar.""" def __init__(self, use_tqdm: bool = True, total: int | None = None, *args: Any, **kwargs: Any) -> None: - """ - Initialize the ProgressParallel object. + """Initialize the ProgressParallel object. :param use_tqdm: Whether to show the progress bar. :param total: Total number of tasks (batches) you expect to process. If None, @@ -86,15 +85,3 @@ def importable(module: str, extra: str) -> None: raise ImportError( f"`{module}`, is required. Please reinstall model2vec with the `{extra}` extra. `pip install model2vec[{extra}]`" ) - - -def setup_logging() -> None: - """Simple logging setup.""" - from rich.logging import RichHandler - - logging.basicConfig( - level="INFO", - format="%(name)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - handlers=[RichHandler(rich_tracebacks=True)], - ) diff --git a/pyproject.toml b/pyproject.toml index 9a8642f..15f050b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,7 @@ dependencies = [ "jinja2", "joblib", "numpy", - "rich", "safetensors", - "setuptools", "tokenizers>=0.20", "tqdm", ] @@ -51,13 +49,13 @@ model2vec = [ [project.optional-dependencies] dev = [ - "black", "ipython", "mypy", "pre-commit", "pytest", "pytest-cov", "ruff", + "setuptools", ] distill = ["torch", "transformers<5.4.0", "scikit-learn", "skeletoken>=0.3.3"] @@ -73,7 +71,7 @@ quantization = ["scikit-learn"] "Source" = "https://github.com/MinishLab/model2vec" [tool.ruff] -exclude = [".venv/"] +exclude = [".venv/", "*.ipynb"] line-length = 120 target-version = "py310" @@ -99,7 +97,7 @@ ignore = [ # Allow self and cls to be untyped, and allow Any type "ANN001", "ANN002", "ANN401", # Pydocstyle ignores - "D100", "D101", "D104", "D203", "D212", "D401", + "D100", "D101", "D104", "D203", "D213", "D401", # Allow use of f-strings in logging "G004" ] diff --git a/scripts/export_to_onnx.py b/scripts/export_to_onnx.py index 7962f0a..dda4c5a 100644 --- a/scripts/export_to_onnx.py +++ b/scripts/export_to_onnx.py @@ -38,8 +38,7 @@ def __init__(self, model: StaticModel) -> None: self.median_token_length = model.median_token_length def forward(self, input_ids: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the model. + """Forward pass of the model. :param input_ids: The input token ids. :param offsets: The offsets to compute the mean pooling. @@ -53,8 +52,7 @@ def forward(self, input_ids: torch.Tensor, offsets: torch.Tensor) -> torch.Tenso return embeddings def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple[torch.Tensor, torch.Tensor]: - """ - Tokenize the input sentences. + """Tokenize the input sentences. :param sentences: The input sentences. :param max_length: The maximum length of the input_ids. @@ -83,8 +81,7 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple def export_model_to_onnx(model_path: str, save_path: Path) -> None: - """ - Export the StaticModel to ONNX format and save tokenizer files. + """Export the StaticModel to ONNX format and save tokenizer files. :param model_path: The path to the pretrained StaticModel. :param save_path: The directory to save the model and related files. @@ -129,13 +126,11 @@ def export_model_to_onnx(model_path: str, save_path: Path) -> None: def save_tokenizer(tokenizer: Tokenizer, save_directory: Path) -> None: - """ - Save tokenizer files in a format compatible with Transformers. + """Save tokenizer files in a format compatible with Transformers. :param tokenizer: The tokenizer from the StaticModel. :param save_directory: The directory to save the tokenizer files. :raises FileNotFoundError: If config.json is not found in save_directory. - :raises FileNotFoundError: If tokenizer_config.json is not found in save_directory. :raises ValueError: If tokenizer_name is not found in config.json. """ tokenizer_json_path = save_directory / "tokenizer.json"