Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions demos/embeddings/ovms_mteb.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,38 @@
# limitations under the License.
#

from __future__ import annotations

import argparse

import mteb
from mteb.models.model_implementations.openai_models import OpenAIModel
from openai import OpenAI

parser = argparse.ArgumentParser(description='Run MTEB benchmark against OVMS embeddings endpoint')
parser = argparse.ArgumentParser(
description="Run MTEB benchmark against OVMS embeddings endpoint")
parser.add_argument('--service_url', required=False, default='http://localhost:8000/v3/embeddings',
help='Specify url to embeddings endpoint. default:http://localhost:8000/v3/embeddings', dest='service_url')
parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
help='Specify url to embeddings endpoint.',
dest='service_url')
Comment on lines 25 to +27
parser.add_argument('--model_name', default='Alibaba-NLP/gte-large-en-v1.5',
help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
dest='model_name')
parser.add_argument('--dataset', default='Banking77Classification', help='Dataset to benchmark. default: Banking77Classification',
parser.add_argument('--dataset', default='Banking77Classification',
help='Dataset to benchmark. default: Banking77Classification',
dest='dataset')
parser.add_argument('--embed_dim', type=int, default=None, help='Embedding dimension. Auto-detected if not provided.',
parser.add_argument('--results_dir', default='results', help='Results directory. default: results',
dest='results_dir')
parser.add_argument('--verbosity', choices=['0', '1', '2', '3'], default='3', )
parser.add_argument('--overwrite_results', action='store_true', default=True, )
parser.add_argument('--eval_splits', nargs='*', default=None,
help='Evaluation splits to use, e.g. --eval_splits test dev. If not set, '
'all splits defined in the task are used.',
dest='eval_splits')
parser.add_argument('--embed_dim', type=int, default=None,
help='Embedding dimension. Auto-detected if not provided.',
dest='embed_dim')
parser.add_argument('--max_tokens', type=int, default=99999, help='Max input tokens for truncation. default: 99999',
parser.add_argument('--max_tokens', type=int, default=99999,
help='Max input tokens for truncation. default: 99999',
dest='max_tokens')

args = vars(parser.parse_args())

client = OpenAI(base_url=args['service_url'], api_key="unused")
Expand All @@ -48,10 +61,17 @@
embed_dim=embed_dim,
client=client,
)

tasks = mteb.get_task(args['dataset'])
tasks = mteb.get_task(args['dataset'], eval_splits=args["eval_splits"])
evaluation = mteb.MTEB(tasks=[tasks])
evaluation.run(model, verbosity=3, overwrite_results=True, output_folder='results')
results = evaluation.run(model=model,
verbosity=int(args['verbosity']),
overwrite_results=args['overwrite_results'],
output_folder=args["results_dir"]
)
for task_results in results:
# Banking77Classification: 0.8495 mean accuracy
print(f"{task_results.task_name}: {task_results.get_score():.4f} mean {task_results.task.metadata.main_score}")

# For full leaderboard tests set run:
# benchmark = mteb.get_benchmark("MTEB(eng)")
# evaluation = mteb.MTEB(tasks=benchmark)
Expand Down