diff --git a/demos/embeddings/ovms_mteb.py b/demos/embeddings/ovms_mteb.py index 80c7b5678a..b08019f5ec 100644 --- a/demos/embeddings/ovms_mteb.py +++ b/demos/embeddings/ovms_mteb.py @@ -14,25 +14,38 @@ # limitations under the License. # -from __future__ import annotations - import argparse import mteb from mteb.models.model_implementations.openai_models import OpenAIModel from openai import OpenAI -parser = argparse.ArgumentParser(description='Run MTEB benchmark against OVMS embeddings endpoint') +parser = argparse.ArgumentParser( + description="Run MTEB benchmark against OVMS embeddings endpoint") parser.add_argument('--service_url', required=False, default='http://localhost:8000/v3/embeddings', - help='Specify url to embeddings endpoint. default:http://localhost:8000/v3/embeddings', dest='service_url') -parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5', + help='Specify url to embeddings endpoint.', + dest='service_url') +parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5', + help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5', dest='model_name') -parser.add_argument('--dataset', default='Banking77Classification', help='Dataset to benchmark. default: Banking77Classification', +parser.add_argument('--dataset', default='Banking77Classification', + help='Dataset to benchmark. default: Banking77Classification', dest='dataset') -parser.add_argument('--embed_dim', type=int, default=None, help='Embedding dimension. Auto-detected if not provided.', +parser.add_argument('--results_dir', default='results', help='Results directory. default: results', + dest='results_dir') +parser.add_argument('--verbosity', choices=['0', '1', '2', '3'], default='3', ) +parser.add_argument('--overwrite_results', action='store_true', default=True, ) +parser.add_argument('--eval_splits', nargs='*', default=None, + help='Evaluation splits to use, e.g. --eval_splits test dev. If not set, ' + 'all splits defined in the task are used.', + dest='eval_splits') +parser.add_argument('--embed_dim', type=int, default=None, + help='Embedding dimension. Auto-detected if not provided.', dest='embed_dim') -parser.add_argument('--max_tokens', type=int, default=99999, help='Max input tokens for truncation. default: 99999', +parser.add_argument('--max_tokens', type=int, default=99999, + help='Max input tokens for truncation. default: 99999', dest='max_tokens') + args = vars(parser.parse_args()) client = OpenAI(base_url=args['service_url'], api_key="unused") @@ -48,10 +61,17 @@ embed_dim=embed_dim, client=client, ) - -tasks = mteb.get_task(args['dataset']) +tasks = mteb.get_task(args['dataset'], eval_splits=args["eval_splits"]) evaluation = mteb.MTEB(tasks=[tasks]) -evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results') +results = evaluation.run(model=model, + verbosity=int(args['verbosity']), + overwrite_results=args['overwrite_results'], + output_folder=args["results_dir"] + ) +for task_results in results: + # Banking77Classification: 0.8495 mean accuracy + print(f"{task_results.task_name}: {task_results.get_score():.4f} mean {task_results.task.metadata.main_score}") + # For full leaderboard tests set run: # benchmark = mteb.get_benchmark("MTEB(eng)") # evaluation = mteb.MTEB(tasks=benchmark)