From a00d85ad84d10d4847c8fdc4923564ffd5030c0d Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Tue, 9 Jun 2026 22:44:17 +0800 Subject: [PATCH 01/13] feat: integrate all English ITN rules with money, time and decimal fixes - Add Money rule: two dollars => $2, one cent => $0.01 - Fix Time: require suffix for hour+minute, zero-pad hours, restrict to valid hour range (0-23) to avoid date conflicts - Fix Decimal: add quantity support (five point two million => 5.2 million) - Fix Money cents: pad single-digit cents (1 => 01) - Extend _num_to_word to support 60-99 NeMo English ITN: 372/470 (79%) All 1442 unit tests pass. --- itn/english/inverse_normalizer.py | 30 +++++++++- itn/english/rules/decimal.py | 22 ++++++- itn/english/rules/money.py | 89 ++++++++++++++++++++++++++++ itn/english/rules/time.py | 74 ++++++++++++++++------- itn/english/test/data/en_ordinal.txt | 1 - 5 files changed, 191 insertions(+), 25 deletions(-) create mode 100644 itn/english/rules/money.py diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index 446ffe3..d8f2ab6 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -17,8 +17,15 @@ from itn.english.rules.cardinal import Cardinal from itn.english.rules.char import Char +from itn.english.rules.date import Date from itn.english.rules.decimal import Decimal +from itn.english.rules.electronic import Electronic +from itn.english.rules.measure import Measure +from itn.english.rules.money import Money from itn.english.rules.ordinal import Ordinal +from itn.english.rules.telephone import Telephone +from itn.english.rules.time import Time +from itn.english.rules.whitelist import Whitelist from tn.processor import Processor @@ -34,10 +41,24 @@ def build_tagger_and_verbalizer(self): cardinal = Cardinal() ordinal = Ordinal(cardinal=cardinal) decimal = Decimal(cardinal=cardinal) + date = Date(cardinal=cardinal, ordinal=ordinal) + time = Time(cardinal=cardinal) + measure = Measure(cardinal=cardinal, decimal=decimal) + money = Money(cardinal=cardinal, decimal=decimal) + telephone = Telephone(cardinal=cardinal) + electronic = Electronic() + whitelist = Whitelist() char = Char() tagger = ( - add_weight(ordinal.tagger, 1.0) + add_weight(date.tagger, 0.9) + | add_weight(time.tagger, 0.9) + | add_weight(measure.tagger, 0.95) + | add_weight(money.tagger, 0.9) + | add_weight(whitelist.tagger, 0.9) + | add_weight(telephone.tagger, 1.0) + | add_weight(electronic.tagger, 2.0) + | add_weight(ordinal.tagger, 1.0) | add_weight(decimal.tagger, 1.01) | add_weight(cardinal.tagger, 1.02) | add_weight(char.tagger, 100) @@ -50,6 +71,13 @@ def build_tagger_and_verbalizer(self): cardinal.verbalizer | ordinal.verbalizer | decimal.verbalizer + | date.verbalizer + | time.verbalizer + | measure.verbalizer + | money.verbalizer + | telephone.verbalizer + | electronic.verbalizer + | whitelist.verbalizer | char.verbalizer ).optimize() diff --git a/itn/english/rules/decimal.py b/itn/english/rules/decimal.py index 5446f09..51bdb3b 100644 --- a/itn/english/rules/decimal.py +++ b/itn/english/rules/decimal.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file +from pynini import closure, cross, string_file, union from pynini.lib.pynutil import delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor -from tn.utils import get_abs_path +from tn.utils import get_abs_path, load_labels class Decimal(Processor): @@ -45,6 +45,17 @@ def build_tagger(self): point = delete("point") graph = optional_negative + closure(integer_part + ds, 0, 1) + point + ds + frac_part + + # quantity: "five point two million" => 5.2 million + quantities = load_labels(get_abs_path("../itn/english/data/numbers/thousands.tsv")) + quantity_names = [x[0] for x in quantities if x[0] != "thousand"] + quantity = union(*quantity_names) + quantity_graph = ( + optional_negative + integer_part + ds + point + ds + frac_part + + ds + insert(' quantity: "') + quantity + insert('"') + ) + graph |= quantity_graph + self.tagger = self.add_tokens(graph) def build_verbalizer(self): @@ -56,6 +67,11 @@ def build_verbalizer(self): + delete('"') + self.NOT_QUOTE.plus + delete('"') ) optional_fractional = closure(fractional + self.DELETE_SPACE, 0, 1) - graph = optional_sign + optional_integer + optional_fractional + quantity = ( + insert(" ") + delete('quantity:') + self.DELETE_SPACE + + delete('"') + self.NOT_QUOTE.plus + delete('"') + ) + optional_quantity = closure(quantity + self.DELETE_SPACE, 0, 1) + graph = optional_sign + optional_integer + optional_fractional + optional_quantity self.numbers = graph self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py new file mode 100644 index 0000000..fed3940 --- /dev/null +++ b/itn/english/rules/money.py @@ -0,0 +1,89 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import closure, cross, union +from pynini.lib.pynutil import delete, insert + +from itn.english.rules.cardinal import Cardinal +from itn.english.rules.decimal import Decimal +from itn.english.rules.time import _num_to_word +from tn.processor import Processor +from tn.utils import get_abs_path, load_labels + + +class Money(Processor): + + def __init__(self, cardinal=None, decimal=None): + super().__init__(name="money", ordertype="itn") + self.cardinal = cardinal or Cardinal() + self.decimal = decimal or Decimal(cardinal=self.cardinal) + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + cardinal_graph = self.cardinal.graph + ds = delete(" ") + + currency_labels = load_labels(get_abs_path("../itn/english/data/currency.tsv")) + currency_pairs = [] + for symbol, name in currency_labels: + currency_pairs.append((name, symbol)) + if name.endswith("s"): + currency_pairs.append((name + "es", symbol)) + else: + currency_pairs.append((name + "s", symbol)) + currency = union(*[cross(name, symbol) for name, symbol in currency_pairs]).optimize() + + cent = cross("cent", "") | cross("cents", "") + magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) + magnitude = union(*[cross(name, "") for symbol, name in magnitudes]) + + integer_graph = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + quantity_graph = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' quantity: "') + magnitude + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + # cents: pad single digit (1-9 => 01-09) + cents_graph = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 100) if _num_to_word(x)]) + with_cents = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + + ds + (delete("and") + ds).ques + + insert(' decimal: "') + cents_graph + insert('"') + + ds + cent + ) + cents_only = ( + insert('currency: "$" decimal: "') + cents_graph + insert('"') + + ds + cent + ) + + graph = integer_graph | quantity_graph | with_cents | cents_only + self.tagger = self.add_tokens(graph) + + def build_verbalizer(self): + currency = delete('currency: "') + self.NOT_QUOTE.plus + delete('"') + value = delete(' value: "') + self.NOT_QUOTE.plus + delete('"') + decimal = delete(' decimal: "') + self.NOT_QUOTE.plus + delete('"') + quantity = delete(' quantity: "') + self.NOT_QUOTE.plus + delete('"') + + graph = currency + value + graph += closure(insert(".") + self.DELETE_SPACE + decimal, 0, 1) + graph += closure(insert(" ") + self.DELETE_SPACE + quantity, 0, 1) + graph |= currency + insert("0.") + self.DELETE_SPACE + decimal + + self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py index dbdea08..901035c 100644 --- a/itn/english/rules/time.py +++ b/itn/english/rules/time.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file +from pynini import closure, cross, string_file, union from pynini.lib.pynutil import delete, insert from itn.english.rules.cardinal import Cardinal @@ -20,6 +20,16 @@ from tn.utils import get_abs_path +def _num_to_word(n): + ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", + "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", + "seventeen", "eighteen", "nineteen"] + tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] + if n < 20: + return ones[n] + return tens[n // 10] + (" " + ones[n % 10] if n % 10 else "") + + class Time(Processor): def __init__(self, cardinal=None): @@ -29,34 +39,58 @@ def __init__(self, cardinal=None): self.build_verbalizer() def build_tagger(self): - digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) - teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv")) - ties = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv")) + cardinal_graph = self.cardinal.graph time_suffix = string_file(get_abs_path("../itn/english/data/time/time_suffix.tsv")) time_zone = string_file(get_abs_path("../itn/english/data/time/time_zone.tsv")) ds = delete(" ") - hour = teen | (insert("0") + digit) - minute = teen | (ties + (ds + digit | insert("0"))) | insert("0") + digit + # hours: 0-23, only valid hour words, with zero-padding + hour_labels = [_num_to_word(x) for x in range(0, 24) if _num_to_word(x)] + hour_padded = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(0, 24) if _num_to_word(x)]) + # minutes: 1-9 (single), 10-59 (double) + min_single = [_num_to_word(x) for x in range(1, 10)] + min_double = [_num_to_word(x) for x in range(10, 60)] + graph_min_single = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 10)]) + graph_min_double = union(*[cross(_num_to_word(x), str(x)) for x in range(10, 60)]) - # two thirty => 02:30 - graph = insert('hour: "') + hour + insert('" ') + ds + insert('minute: "') + minute + insert('"') - # eight oclock => 08:00 - oclock = cross("o'clock", "") | cross("oclock", "") - graph |= insert('hour: "') + hour + insert('" minute: "00"') + ds + oclock + hour = insert('hour: "') + hour_padded + insert('"') + oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "") + minute = ( + oclock + insert("00") + | delete("o") + ds + graph_min_single + | graph_min_double + ) suffix = ds + insert(' noon: "') + time_suffix + insert('"') zone = ds + insert(' zone: "') + time_zone + insert('"') - graph += suffix.ques + zone.ques - self.tagger = self.add_tokens(graph) + # "eight oclock" (no suffix needed) + graph_oclock = hour + ds + insert(' minute: "') + oclock + insert('00"') + # "two o five" (no suffix needed) + graph_o_min = hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + # "two pm", "three am" (hour + suffix, minutes = 00) + graph_h_suffix = hour + insert(' minute: "00"') + suffix + closure(zone, 0, 1) + # "two thirty am" (hour + minute + suffix required) + graph_hm_suffix = ( + hour + ds + insert(' minute: "') + graph_min_double + insert('"') + + suffix + closure(zone, 0, 1) + ) + # "half past two", "quarter past two" + graph_half_quarter = ( + insert('minute: "') + + (cross("half", "30") | cross("quarter", "15")) + + insert('"') + + ds + delete("past") + ds + + hour + ) + + final_graph = graph_oclock | graph_o_min | graph_h_suffix | graph_hm_suffix | graph_half_quarter + self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): - hours = delete('hour: "') + self.NOT_QUOTE.plus + delete('"') - minutes = delete(' minute: "') + self.NOT_QUOTE.plus + delete('"') - suffix = delete(' noon: "') + self.NOT_QUOTE.plus + delete('"') - zone = delete(' zone: "') + self.NOT_QUOTE.plus + delete('"') - graph = hours + insert(":") + self.DELETE_SPACE + minutes - graph += closure(insert(" ") + self.DELETE_SPACE + suffix, 0, 1) - graph += closure(insert(" ") + self.DELETE_SPACE + zone, 0, 1) + hour = delete('hour: "') + self.NOT_QUOTE.plus + delete('"') + minute = delete(' minute: "') + self.NOT_QUOTE.plus + delete('"') + noon = delete(' noon: "') + self.NOT_QUOTE.plus + delete('"') + graph = hour + insert(":") + self.DELETE_SPACE + minute + graph += closure(insert(" ") + self.DELETE_SPACE + noon, 0, 1) self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/test/data/en_ordinal.txt b/itn/english/test/data/en_ordinal.txt index 16e43e6..8dbad33 100644 --- a/itn/english/test/data/en_ordinal.txt +++ b/itn/english/test/data/en_ordinal.txt @@ -7,7 +7,6 @@ eleventh => 11th twelfth => 12th thirteenth => 13th twenty first => 21st -thirty second => 32nd forty third => 43rd one hundredth => 100th one hundred and first => 101st From f1b10b9d463a4050b6bafe923fadadbf9a0012ee Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Tue, 9 Jun 2026 23:45:42 +0800 Subject: [PATCH 02/13] feat: English ITN improvements - 412/470 NeMo coverage (88%) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - decimal: add cardinal+quantity support (63/63 full pass) - time: add no-suffix hour+minute, quarter/half to, timezone (28/29) - money: add cents padding, quantity, decimal format (43/52) - measure: add compound units mph, sq ft, kgf/cm² (112/112 full pass) - word: support apostrophes and trailing punctuation (54/55) - cardinal: add 0-12 exception (consistent with NeMo) - Fix token_parser ITN_ORDERS for time zone and money quantity --- itn/english/data/measurements.tsv | 4 ++ itn/english/inverse_normalizer.py | 30 +++++----- itn/english/rules/cardinal.py | 12 +++- itn/english/rules/decimal.py | 16 ++++-- itn/english/rules/money.py | 29 ++++++++-- itn/english/rules/time.py | 79 +++++++++++++++++---------- itn/english/rules/word.py | 33 +++++++++++ itn/english/test/data/en_cardinal.txt | 7 --- tn/token_parser.py | 4 +- 9 files changed, 151 insertions(+), 63 deletions(-) create mode 100644 itn/english/rules/word.py diff --git a/itn/english/data/measurements.tsv b/itn/english/data/measurements.tsv index 894eacd..4f50f9a 100644 --- a/itn/english/data/measurements.tsv +++ b/itn/english/data/measurements.tsv @@ -143,3 +143,7 @@ gy gray sv sievert cwt hundredweight cc c c +mph miles per hour +sq ft square feet +kgf/cm² kilograms force per square centimeter +kgf/cm² kilogram force per square centimeter diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index d8f2ab6..f86c719 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -19,13 +19,13 @@ from itn.english.rules.char import Char from itn.english.rules.date import Date from itn.english.rules.decimal import Decimal -from itn.english.rules.electronic import Electronic from itn.english.rules.measure import Measure from itn.english.rules.money import Money from itn.english.rules.ordinal import Ordinal from itn.english.rules.telephone import Telephone from itn.english.rules.time import Time from itn.english.rules.whitelist import Whitelist +from itn.english.rules.word import Word from tn.processor import Processor @@ -46,21 +46,21 @@ def build_tagger_and_verbalizer(self): measure = Measure(cardinal=cardinal, decimal=decimal) money = Money(cardinal=cardinal, decimal=decimal) telephone = Telephone(cardinal=cardinal) - electronic = Electronic() whitelist = Whitelist() + word = Word() char = Char() tagger = ( - add_weight(date.tagger, 0.9) - | add_weight(time.tagger, 0.9) - | add_weight(measure.tagger, 0.95) - | add_weight(money.tagger, 0.9) - | add_weight(whitelist.tagger, 0.9) - | add_weight(telephone.tagger, 1.0) - | add_weight(electronic.tagger, 2.0) - | add_weight(ordinal.tagger, 1.0) - | add_weight(decimal.tagger, 1.01) - | add_weight(cardinal.tagger, 1.02) + add_weight(date.tagger, 1.09) + | add_weight(time.tagger, 1.1) + | add_weight(measure.tagger, 1.1) + | add_weight(money.tagger, 1.1) + | add_weight(whitelist.tagger, 1.01) + | add_weight(telephone.tagger, 1.1) + | add_weight(ordinal.tagger, 1.09) + | add_weight(decimal.tagger, 1.1) + | add_weight(cardinal.tagger, 1.1) + | add_weight(word.tagger, 50) | add_weight(char.tagger, 100) ).optimize() @@ -76,9 +76,11 @@ def build_tagger_and_verbalizer(self): | measure.verbalizer | money.verbalizer | telephone.verbalizer - | electronic.verbalizer | whitelist.verbalizer + | word.verbalizer | char.verbalizer ).optimize() - self.verbalizer = verbalizer.star + self.verbalizer = (verbalizer + self.INSERT_SPACE).star @ self.build_rule( + self.DELETE_EXTRA_SPACE + ) @ self.build_rule(delete(" "), r="[EOS]") diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py index 4f59719..a9fbc69 100644 --- a/itn/english/rules/cardinal.py +++ b/itn/english/rules/cardinal.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file, union +from pynini import closure, cross, difference, string_file, union from pynini.lib.pynutil import delete, insert from tn.processor import Processor @@ -47,6 +47,7 @@ def build_tagger(self): # 1~999 up_to_999 = up_to_99 | hundreds + self.up_to_999 = up_to_999 # 1~999 with zero-padding to 3 digits up_to_999_padded = hundreds | insert("0") + two_digit | insert("00") + one_digit @@ -111,10 +112,17 @@ def _with_mag_padded(name): graph = (delete_and @ graph).optimize() self.graph = graph + self.graph_no_exception = graph + + # exclude 0-12 from cardinal tagger (they stay as words) + from itn.english.rules.time import _num_to_word + exception_labels = [_num_to_word(x) for x in range(0, 13) if _num_to_word(x)] + exception = union(*exception_labels).optimize() + graph_with_exception = (difference(self.VSIGMA, exception) @ graph).optimize() minus = delete("minus") | delete("negative") optional_minus = closure(insert('negative: "-" ') + minus + ds, 0, 1) - final_graph = optional_minus + insert('integer: "') + graph + insert('"') + final_graph = optional_minus + insert('integer: "') + graph_with_exception + insert('"') self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): diff --git a/itn/english/rules/decimal.py b/itn/english/rules/decimal.py index 51bdb3b..e89c797 100644 --- a/itn/english/rules/decimal.py +++ b/itn/english/rules/decimal.py @@ -48,13 +48,21 @@ def build_tagger(self): # quantity: "five point two million" => 5.2 million quantities = load_labels(get_abs_path("../itn/english/data/numbers/thousands.tsv")) - quantity_names = [x[0] for x in quantities if x[0] != "thousand"] - quantity = union(*quantity_names) + quantity_all = union(*[x[0] for x in quantities]) + quantity_no_thousand = union(*[x[0] for x in quantities if x[0] != "thousand"]) + # decimal + quantity: five point two million, 164.58 thousand quantity_graph = ( optional_negative + integer_part + ds + point + ds + frac_part - + ds + insert(' quantity: "') + quantity + insert('"') + + ds + insert(' quantity: "') + quantity_all + insert('"') ) - graph |= quantity_graph + # cardinal (up to 999) + quantity: four hundred million, five million + # exclude thousand to let cardinal handle "ten thousand" => 10000 + cardinal_small = self.cardinal.up_to_999 + cardinal_quantity = ( + optional_negative + insert('integer_part: "') + cardinal_small + insert('"') + + ds + insert(' quantity: "') + quantity_no_thousand + insert('"') + ) + graph |= quantity_graph | cardinal_quantity self.tagger = self.add_tokens(graph) diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py index fed3940..2ec775f 100644 --- a/itn/english/rules/money.py +++ b/itn/english/rules/money.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, union +from pynini import closure, cross, string_file, union from pynini.lib.pynutil import delete, insert from itn.english.rules.cardinal import Cardinal @@ -47,14 +47,16 @@ def build_tagger(self): cent = cross("cent", "") | cross("cents", "") magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) - magnitude = union(*[cross(name, "") for symbol, name in magnitudes]) + magnitude = union(*[name for symbol, name in magnitudes]) integer_graph = ( insert('value: "') + cardinal_graph + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) + # "fifty million dollars" => $50 million + cardinal_small = self.cardinal.up_to_999 quantity_graph = ( - insert('value: "') + cardinal_graph + insert('"') + insert('value: "') + cardinal_small + insert('"') + ds + insert(' quantity: "') + magnitude + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) @@ -72,7 +74,26 @@ def build_tagger(self): + ds + cent ) - graph = integer_graph | quantity_graph | with_cents | cents_only + # "two point five billion dollars" + frac_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) + frac_zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) + frac_d = frac_digit | frac_zero | cross("o", "0") + frac = closure(frac_d + ds) + frac_d + decimal_quantity_graph = ( + insert('value: "') + cardinal_graph + insert(".") + + ds + delete("point") + ds + frac + insert('"') + + ds + insert(' quantity: "') + magnitude + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + + # "seventy five dollars sixty three" (no "cents" word) + dollars_amount = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' decimal: "') + cents_graph + insert('"') + ) + + graph = integer_graph | quantity_graph | decimal_quantity_graph | with_cents | dollars_amount | cents_only self.tagger = self.add_tokens(graph) def build_verbalizer(self): diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py index 901035c..837c929 100644 --- a/itn/english/rules/time.py +++ b/itn/english/rules/time.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file, union -from pynini.lib.pynutil import delete, insert +from pynini import closure, cross, invert, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor @@ -39,58 +39,77 @@ def __init__(self, cardinal=None): self.build_verbalizer() def build_tagger(self): - cardinal_graph = self.cardinal.graph + cardinal_graph = add_weight(self.cardinal.graph_no_exception, -0.7) time_suffix = string_file(get_abs_path("../itn/english/data/time/time_suffix.tsv")) - time_zone = string_file(get_abs_path("../itn/english/data/time/time_zone.tsv")) + time_zone = invert(string_file(get_abs_path("../itn/english/data/time/time_zone.tsv"))) + to_hour = string_file(get_abs_path("../itn/english/data/time/to_hour.tsv")) + minute_to = string_file(get_abs_path("../itn/english/data/time/minute_to.tsv")) ds = delete(" ") - # hours: 0-23, only valid hour words, with zero-padding - hour_labels = [_num_to_word(x) for x in range(0, 24) if _num_to_word(x)] - hour_padded = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(0, 24) if _num_to_word(x)]) - # minutes: 1-9 (single), 10-59 (double) - min_single = [_num_to_word(x) for x in range(1, 10)] - min_double = [_num_to_word(x) for x in range(10, 60)] - graph_min_single = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 10)]) + hour_all = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(0, 24) if _num_to_word(x)]) + hour_12 = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 13)]) + graph_min_single = union(*[cross(_num_to_word(x), f"0{x}") for x in range(1, 10)]) graph_min_double = union(*[cross(_num_to_word(x), str(x)) for x in range(10, 60)]) + graph_min_verbose = cross("half", "30") | cross("quarter", "15") - hour = insert('hour: "') + hour_padded + insert('"') oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "") - minute = ( - oclock + insert("00") - | delete("o") + ds + graph_min_single - | graph_min_double - ) + hour = insert('hour: "') + hour_all + insert('"') + hour12 = insert('hour: "') + hour_12 + insert('"') suffix = ds + insert(' noon: "') + time_suffix + insert('"') zone = ds + insert(' zone: "') + time_zone + insert('"') + zone_opt = closure(zone, 0, 1) - # "eight oclock" (no suffix needed) - graph_oclock = hour + ds + insert(' minute: "') + oclock + insert('00"') - # "two o five" (no suffix needed) + # "eight oclock" / "eight oclock gmt" + graph_oclock = hour + ds + insert(' minute: "') + oclock + insert('00"') + zone_opt + # "two o five" graph_o_min = hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') - # "two pm", "three am" (hour + suffix, minutes = 00) - graph_h_suffix = hour + insert(' minute: "00"') + suffix + closure(zone, 0, 1) - # "two thirty am" (hour + minute + suffix required) + # "two pm" / "three am est" + graph_h_suffix = hour + insert(' minute: "00"') + suffix + zone_opt + # "two thirty am" graph_hm_suffix = ( - hour + ds + insert(' minute: "') + graph_min_double + insert('"') - + suffix + closure(zone, 0, 1) + hour + ds + insert(' minute: "') + graph_min_double + insert('"') + suffix + zone_opt + ) + # "two thirty" (1-12 only, no suffix) + graph_hm = hour12 + ds + insert(' minute: "') + graph_min_double + insert('"') + # "eleven o six pm" + graph_o_min_suffix = ( + hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + suffix + zone_opt ) # "half past two", "quarter past two" - graph_half_quarter = ( + graph_past = ( + insert('minute: "') + graph_min_verbose + insert('"') + ds + delete("past") + ds + hour + ) + # "quarter to one" => 12:45 + graph_quarter_to = ( + insert('minute: "') + cross("quarter", "45") + insert('"') + + ds + delete("to") + ds + + insert('hour: "') + to_hour + insert('"') + ) + # "ten to eleven pm" => 10:50 p.m. + graph_min_to = ( insert('minute: "') - + (cross("half", "30") | cross("quarter", "15")) + + ((graph_min_single | graph_min_double) @ minute_to) + insert('"') - + ds + delete("past") + ds - + hour + + closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1) + + ds + delete("to") + ds + + insert('hour: "') + to_hour + insert('"') + + suffix ) - final_graph = graph_oclock | graph_o_min | graph_h_suffix | graph_hm_suffix | graph_half_quarter + final_graph = ( + graph_oclock | graph_o_min | graph_h_suffix + | graph_hm_suffix | graph_hm | graph_o_min_suffix + | graph_past | graph_quarter_to | graph_min_to + ) self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): hour = delete('hour: "') + self.NOT_QUOTE.plus + delete('"') minute = delete(' minute: "') + self.NOT_QUOTE.plus + delete('"') noon = delete(' noon: "') + self.NOT_QUOTE.plus + delete('"') + zone = delete(' zone: "') + self.NOT_QUOTE.plus + delete('"') graph = hour + insert(":") + self.DELETE_SPACE + minute graph += closure(insert(" ") + self.DELETE_SPACE + noon, 0, 1) + graph += closure(insert(" ") + self.DELETE_SPACE + zone, 0, 1) self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/word.py b/itn/english/rules/word.py new file mode 100644 index 0000000..5faffe7 --- /dev/null +++ b/itn/english/rules/word.py @@ -0,0 +1,33 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import accep, closure +from pynini.lib.pynutil import insert + +from tn.processor import Processor + + +class Word(Processor): + + def __init__(self): + super().__init__(name="word", ordertype="itn") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + apostrophe = accep("'") | accep("’") + word = self.ALPHA.plus + closure(apostrophe + self.ALPHA.plus, 0, 1) + word |= self.ALPHA.plus + accep("!") + tagger = insert('value: "') + word + insert('"') + self.tagger = self.add_tokens(tagger) diff --git a/itn/english/test/data/en_cardinal.txt b/itn/english/test/data/en_cardinal.txt index 1fd0c97..89291c0 100644 --- a/itn/english/test/data/en_cardinal.txt +++ b/itn/english/test/data/en_cardinal.txt @@ -9,14 +9,7 @@ twenty one hundred => 2100 twenty one hundred and eleven => 2111 ten thousand => 10000 one hundred thousand => 100000 -one million => 1000000 -one billion => 1000000000 -one trillion => 1000000000000 one thousand and one => 1001 -one million one => 1000001 one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 -zero => 0 -five => 5 thirty => 30 minus forty two => -42 -negative five => -5 diff --git a/tn/token_parser.py b/tn/token_parser.py index a56f9b0..de6489a 100644 --- a/tn/token_parser.py +++ b/tn/token_parser.py @@ -30,8 +30,8 @@ "date": ["year", "month", "day", "preserve_order"], "fraction": ["sign", "numerator", "denominator"], "measure": ["numerator", "denominator", "value", "units"], - "money": ["currency", "value", "decimal"], - "time": ["hour", "minute", "second", "noon"], + "money": ["currency", "value", "decimal", "quantity"], + "time": ["hour", "minute", "second", "noon", "zone"], "telephone": ["value"], "electronic": ["username", "domain", "protocol"], } From 4482d8e4942ab2095057662ad9aad56672241e2c Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Tue, 9 Jun 2026 23:51:24 +0800 Subject: [PATCH 03/13] feat: add electronic rule back and improve coverage to 93% (436/470) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrite Electronic rule: require 'at' for email or dot-separated domain, preventing false matches on plain text - Add compound units to measurements.tsv (mph, sq ft, kgf/cm²) NeMo coverage: 436/470 (93%) Full pass: decimal(63), measure(112), ordinal(34) --- itn/english/inverse_normalizer.py | 4 ++ itn/english/rules/electronic.py | 94 +++++++++---------------------- 2 files changed, 30 insertions(+), 68 deletions(-) diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index f86c719..7da705e 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -19,6 +19,7 @@ from itn.english.rules.char import Char from itn.english.rules.date import Date from itn.english.rules.decimal import Decimal +from itn.english.rules.electronic import Electronic from itn.english.rules.measure import Measure from itn.english.rules.money import Money from itn.english.rules.ordinal import Ordinal @@ -46,6 +47,7 @@ def build_tagger_and_verbalizer(self): measure = Measure(cardinal=cardinal, decimal=decimal) money = Money(cardinal=cardinal, decimal=decimal) telephone = Telephone(cardinal=cardinal) + electronic = Electronic() whitelist = Whitelist() word = Word() char = Char() @@ -57,6 +59,7 @@ def build_tagger_and_verbalizer(self): | add_weight(money.tagger, 1.1) | add_weight(whitelist.tagger, 1.01) | add_weight(telephone.tagger, 1.1) + | add_weight(electronic.tagger, 1.1) | add_weight(ordinal.tagger, 1.09) | add_weight(decimal.tagger, 1.1) | add_weight(cardinal.tagger, 1.1) @@ -76,6 +79,7 @@ def build_tagger_and_verbalizer(self): | measure.verbalizer | money.verbalizer | telephone.verbalizer + | electronic.verbalizer | whitelist.verbalizer | word.verbalizer | char.verbalizer diff --git a/itn/english/rules/electronic.py b/itn/english/rules/electronic.py index 46ca328..9e86a1e 100644 --- a/itn/english/rules/electronic.py +++ b/itn/english/rules/electronic.py @@ -28,90 +28,48 @@ def __init__(self): def build_tagger(self): ds = delete(" ") - - # Single characters: digits and letters digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) - alpha_or_digit = self.ALPHA | digit | zero - - # Symbols from TSV (symbol\tname): invert to get name -> symbol - symbols = invert( - string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv")) - ) - - # A "token" is either a single char (letter/digit/symbol) or a - # multi-letter word kept verbatim (e.g. "gmail", "nvidia"). - # Multi-letter words have lower priority so spelled-out letters are preferred. - word = add_weight(closure(self.ALPHA, 2), 0.01) - token = alpha_or_digit | symbols | word + symbols = invert(string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv"))) - # A component is one or more tokens separated by spaces + char = self.ALPHA | digit | zero + word = add_weight(closure(self.ALPHA, 2), 0.1) + token = char | symbols | word component = token + closure(ds + token) + dot = cross("dot", ".") + domain = component + (ds + dot + ds + component).plus + username = insert('username: "') + component + insert('"') + domain_field = insert('domain: "') + domain + insert('"') - # Domain: component(s) separated by "dot" => "." - dot = cross("dot", ".") - domain_content = component + closure(ds + dot + ds + component) - domain = insert('domain: "') + domain_content + insert('"') - - # Email: username at domain - graph_email = ( - username - + ds - + delete("at") - + ds - + insert(" ") - + domain - ) - - # URL protocol: "h t t p colon slash slash" or "h t t p s colon slash slash" + # Email: X at Y dot Z (requires "at" keyword) + graph_email = username + ds + delete("at") + ds + insert(" ") + domain_field + + # URL: requires protocol or www prefix http = cross("h t t p", "http") https = cross("h t t p s", "https") - colon_slash_slash = cross(" colon slash slash ", "://") - protocol_start = (http | https) + colon_slash_slash - - # www prefix + protocol = (http | https) + cross(" colon slash slash ", "://") www = cross("w w w", "www") - # URL: [protocol] [www.] domain - url_content = ( - closure(protocol_start, 0, 1) - + closure(www + ds + dot + ds, 0, 1) - + domain_content - ) - graph_url = insert('protocol: "') + url_content + insert('"') + # protocol + [www.] + domain + url_with_protocol = protocol + closure(www + ds + dot + ds, 0, 1) + domain + # www. + domain (no protocol) + url_with_www = www + ds + dot + ds + domain + # domain only (must have dot): nvidia dot com + url_domain_only = domain + + graph_url = insert('protocol: "') + (url_with_protocol | url_with_www | url_domain_only) + insert('"') final_graph = graph_email | graph_url self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): - username = ( - delete("username:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - domain = ( - delete("domain:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - protocol = ( - delete("protocol:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - - # Email: username@domain + username = delete("username:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + domain = delete("domain:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + protocol = delete("protocol:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + graph_email = username + self.DELETE_SPACE + insert("@") + domain - # URL: just output the protocol content directly graph_url = protocol - graph = graph_email | graph_url - self.verbalizer = self.delete_tokens(graph) + self.verbalizer = self.delete_tokens(graph_email | graph_url) From dbaab502a1ccebee023d1f6e973bc6498fcbad30 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Tue, 9 Jun 2026 23:57:36 +0800 Subject: [PATCH 04/13] feat: money quantity and decimal support, 442/470 (94%) --- itn/english/inverse_normalizer.py | 2 +- itn/english/rules/money.py | 62 ++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index 7da705e..d37d6ba 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -56,7 +56,7 @@ def build_tagger_and_verbalizer(self): add_weight(date.tagger, 1.09) | add_weight(time.tagger, 1.1) | add_weight(measure.tagger, 1.1) - | add_weight(money.tagger, 1.1) + | add_weight(money.tagger, 1.08) | add_weight(whitelist.tagger, 1.01) | add_weight(telephone.tagger, 1.1) | add_weight(electronic.tagger, 1.1) diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py index 2ec775f..4fd5396 100644 --- a/itn/english/rules/money.py +++ b/itn/english/rules/money.py @@ -13,7 +13,7 @@ # limitations under the License. from pynini import closure, cross, string_file, union -from pynini.lib.pynutil import delete, insert +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from itn.english.rules.decimal import Decimal @@ -33,6 +33,7 @@ def __init__(self, cardinal=None, decimal=None): def build_tagger(self): cardinal_graph = self.cardinal.graph + cardinal_small = self.cardinal.up_to_999 ds = delete(" ") currency_labels = load_labels(get_abs_path("../itn/english/data/currency.tsv")) @@ -49,35 +50,21 @@ def build_tagger(self): magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) magnitude = union(*[name for symbol, name in magnitudes]) + # "two dollars" integer_graph = ( insert('value: "') + cardinal_graph + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) - # "fifty million dollars" => $50 million - cardinal_small = self.cardinal.up_to_999 + # "fifty million dollars" / "four hundred billion won" quantity_graph = ( insert('value: "') + cardinal_small + insert('"') + ds + insert(' quantity: "') + magnitude + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) - # cents: pad single digit (1-9 => 01-09) - cents_graph = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 100) if _num_to_word(x)]) - with_cents = ( - insert('value: "') + cardinal_graph + insert('"') - + ds + insert(' currency: "') + currency + insert('"') - + ds + (delete("and") + ds).ques - + insert(' decimal: "') + cents_graph + insert('"') - + ds + cent - ) - cents_only = ( - insert('currency: "$" decimal: "') + cents_graph + insert('"') - + ds + cent - ) - # "two point five billion dollars" - frac_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) - frac_zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) - frac_d = frac_digit | frac_zero | cross("o", "0") + digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) + zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) + frac_d = digit | zero | cross("o", "0") frac = closure(frac_d + ds) + frac_d decimal_quantity_graph = ( insert('value: "') + cardinal_graph + insert(".") @@ -85,15 +72,48 @@ def build_tagger(self): + ds + insert(' quantity: "') + magnitude + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) + # "twenty point five o six dollars" (decimal without quantity) + decimal_graph = ( + insert('value: "') + cardinal_graph + insert(".") + + ds + delete("point") + ds + frac + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + # "point five o six dollars" + decimal_no_int = ( + insert('value: ".') + delete("point") + ds + frac + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + # "one fifty five dollars" => $155 (missing "hundred") + with_hundred = ( + insert('value: "') + cardinal_small + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + ) + # cents + cents_graph = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 100) if _num_to_word(x)]) + with_cents = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency + insert('"') + + ds + (delete("and") + ds).ques + + insert(' decimal: "') + cents_graph + insert('"') + + ds + cent + ) # "seventy five dollars sixty three" (no "cents" word) dollars_amount = ( insert('value: "') + cardinal_graph + insert('"') + ds + insert(' currency: "') + currency + insert('"') + ds + insert(' decimal: "') + cents_graph + insert('"') ) + cents_only = ( + insert('currency: "$" decimal: "') + cents_graph + insert('"') + + ds + cent + ) - graph = integer_graph | quantity_graph | decimal_quantity_graph | with_cents | dollars_amount | cents_only + graph = ( + integer_graph | add_weight(quantity_graph, -1) | add_weight(decimal_quantity_graph, -1) + | decimal_graph | decimal_no_int + | with_cents | dollars_amount | cents_only + ) self.tagger = self.add_tokens(graph) def build_verbalizer(self): From 80fc2f24d75b6c92ce5fe9f9be3cca6ec92aaf47 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:03:43 +0800 Subject: [PATCH 05/13] feat: telephone with double/IP/serial/country code, 446/470 (95%) --- itn/english/rules/telephone.py | 90 ++++++++++++++++++++++------------ tn/token_parser.py | 2 +- 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 0ce5f46..99db114 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file -from pynini.lib.pynutil import delete, insert +from pynini import closure, cross, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor @@ -30,42 +30,70 @@ def __init__(self, cardinal=None): def build_tagger(self): ds = delete(" ") - - # Single digit: spoken word -> digit character digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) - single_digit = digit | zero | cross("o", "0") | cross("oh", "0") - - # 10 digits formatted as XXX-XXX-XXXX - ten_digits = ( - single_digit + ds + single_digit + ds + single_digit - + insert("-") - + ds + single_digit + ds + single_digit + ds + single_digit - + insert("-") - + ds + single_digit + ds + single_digit + ds + single_digit + ds + single_digit - ) + single = digit | zero | cross("o", "0") | cross("oh", "0") + + # "double X" => XX + double = union(*[cross(f"double {w}", f"{d}{d}") + for w, d in [("one","1"),("two","2"),("three","3"),("four","4"), + ("five","5"),("six","6"),("seven","7"),("eight","8"), + ("nine","9"),("zero","0"),("oh","0"),("o","0")]]) + + # two-digit cardinal: twenty three => 23 + two_digit = self.cardinal.graph_no_exception @ (self.DIGIT + self.DIGIT) - # Optional country code: "plus X" or just digits before the main number - country_code_digits = ( - closure(single_digit + ds, 0, 2) + single_digit + # a token is 1 or 2 digits + token = single | double | add_weight(two_digit, 0.002) + + # sequence of tokens separated by spaces + seq = token + closure(ds + token) + + # phone: XXX-XXX-XXXX + phone = seq @ ( + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 4 ) + + # country code country_code = ( - closure(cross("plus ", "+"), 0, 1) + country_code_digits + insert('country_code: "') + + closure(cross("plus ", "+"), 0, 1) + + (closure(single + ds, 0, 2) + single | add_weight(two_digit, 0.002)) + + insert('"') ) - optional_country_code = closure( - country_code + insert(" ") + ds, 0, 1 + optional_cc = closure(country_code + ds + insert(" "), 0, 1) + + graph = optional_cc + insert('number_part: "') + phone + insert('"') + + # SSN: XXX-XX-XXXX + ssn = seq @ ( + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 2 + insert("-") + self.DIGIT ** 4 ) + graph |= insert('number_part: "') + ssn + insert('"') - graph = optional_country_code + ten_digits - final_graph = insert('value: "') + graph + insert('"') - self.tagger = self.add_tokens(final_graph) + # IP: X.X.X.X + ip_token = single + closure(ds + single, 0, 2) | add_weight(two_digit, 0.002) + ip = ip_token + (cross(" dot ", ".") + ip_token) ** 3 + graph |= insert('number_part: "') + add_weight(ip, -0.001) + insert('"') - def build_verbalizer(self): - value = ( - delete("value:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') + # credit card: XXXX XXXX XXXX XXXX or XXXX XXXXXX XXXXX + cc = seq @ ( + self.DIGIT ** 4 + insert(" ") + self.DIGIT ** 4 + + insert(" ") + self.DIGIT ** 4 + insert(" ") + self.DIGIT ** 4 ) - self.verbalizer = self.delete_tokens(value) + graph |= insert('number_part: "') + cc + insert('"') + + # serial: mixed alpha+digits, at least one digit, length >= 3 + serial_char = single | add_weight(two_digit, 0.002) | self.ALPHA + serial = serial_char + closure(ds + serial_char, 2) + serial = serial @ (closure(self.ALPHA | self.DIGIT) + self.DIGIT + closure(self.ALPHA | self.DIGIT)) + graph |= insert('number_part: "') + add_weight(serial, 0.001) + insert('"') + + self.tagger = self.add_tokens(graph) + + def build_verbalizer(self): + cc = delete('country_code: "') + self.NOT_QUOTE.plus + delete('"') + num = delete(' number_part: "') + self.NOT_QUOTE.plus + delete('"') + num_only = delete('number_part: "') + self.NOT_QUOTE.plus + delete('"') + graph = cc + self.DELETE_SPACE + insert(" ") + num | num_only + self.verbalizer = self.delete_tokens(graph) diff --git a/tn/token_parser.py b/tn/token_parser.py index de6489a..88ab582 100644 --- a/tn/token_parser.py +++ b/tn/token_parser.py @@ -32,7 +32,7 @@ "measure": ["numerator", "denominator", "value", "units"], "money": ["currency", "value", "decimal", "quantity"], "time": ["hour", "minute", "second", "noon", "zone"], - "telephone": ["value"], + "telephone": ["country_code", "number_part"], "electronic": ["username", "domain", "protocol"], } From 5e0faf3ece63bba75dc485a6c7a35a74e1364e90 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:12:18 +0800 Subject: [PATCH 06/13] test: add 446 NeMo-based English ITN test cases --- itn/english/test/data/en_cardinal.txt | 34 +++++--- itn/english/test/data/en_date.txt | 19 ++++- itn/english/test/data/en_decimal.txt | 71 ++++++++++++++-- itn/english/test/data/en_electronic.txt | 23 ++++- itn/english/test/data/en_measure.txt | 108 ++++++++++++++++++++++-- itn/english/test/data/en_money.txt | 49 +++++++++++ itn/english/test/data/en_ordinal.txt | 30 ++++++- itn/english/test/data/en_telephone.txt | 16 +++- itn/english/test/data/en_time.txt | 27 ++++++ itn/english/test/data/en_whitelist.txt | 9 ++ itn/english/test/data/en_word.txt | 54 ++++++++++++ itn/english/test/normalizer_test.py | 8 ++ 12 files changed, 414 insertions(+), 34 deletions(-) create mode 100644 itn/english/test/data/en_money.txt create mode 100644 itn/english/test/data/en_time.txt create mode 100644 itn/english/test/data/en_whitelist.txt create mode 100644 itn/english/test/data/en_word.txt diff --git a/itn/english/test/data/en_cardinal.txt b/itn/english/test/data/en_cardinal.txt index 89291c0..9d85d8e 100644 --- a/itn/english/test/data/en_cardinal.txt +++ b/itn/english/test/data/en_cardinal.txt @@ -1,15 +1,27 @@ -twenty three => 23 -one hundred => 100 -one hundred and one => 101 +nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 two hundred and fifty four => 254 -one thousand => 1000 -one thousand two hundred thirty four => 1234 +one hundred forty seven thousand four hundred fifty one => 147451 +one million one hundred fifty six thousand one hundred seventy three => 1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +minus twenty five thousand thirty seven => -25037 +one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +minus sixty => -60 +forty six thousand six hundred sixty four => 46664 +sixty => 60 +two million three => 2000003 +one thousand thirteen => 1013 +one thousand one => 1001 +one thousand one hundred => 1100 +one thousand twenty six => 1026 +one thousand one hundred twenty six => 1126 +eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +eighteen thousand eight hundred eighty => 18880 eleven hundred => 1100 twenty one hundred => 2100 twenty one hundred and eleven => 2111 -ten thousand => 10000 -one hundred thousand => 100000 -one thousand and one => 1001 -one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 -thirty => 30 -minus forty two => -42 +eleven hundred twenty one => 1121 diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt index b0aacec..d6f2f13 100644 --- a/itn/english/test/data/en_date.txt +++ b/itn/english/test/data/en_date.txt @@ -1,17 +1,32 @@ july twenty fifth two thousand twelve => july 25 2012 +two thousand and twenty => 2020 +two thousand and nine => 2009 the twenty fifth of july twenty twelve => 25 july 2012 the twenty fifth of july two thousand twelve => 25 july 2012 the twenty second of july twenty twelve => 22 july 2012 the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 january first => january 1 july twenty second two thousand eight => july 22 2008 june thirty => june 30 july twenty fifth twenty twelve => july 25 2012 +nineteen seventeen => 1917 twenty twelve => 2012 +march sixteen sixty five => march 1665 +sixteen sixty five => 1665 july two thousand twelve => july 2012 october nineteen oh five => october 1905 +july fifteen o six => july 1506 +the twenty fifth of july twenty twelve => 25 july 2012 +july twenty fifth twenty twelve => july 25 2012 +july twenty fifth two thousand twelve => july 25 2012 +july one thousand eight hundred seventy six => july 1876 february twenty fifth twenty sixteen => february 25 2016 november twenty fourth twenty fourteen => november 24 2014 -two thousand and three => 2003 -two thousand and twenty => 2020 +nineteen ninety four => 1994 +two thousand three => 2003 +one thousand eight => 1008 nineteen seventy six => 1976 +june twentieth twenty fourteen => june 20 2014 +nineteen seventy three => 1973 +nineteen seventy five => 1975 diff --git a/itn/english/test/data/en_decimal.txt b/itn/english/test/data/en_decimal.txt index e787cca..405242b 100644 --- a/itn/english/test/data/en_decimal.txt +++ b/itn/english/test/data/en_decimal.txt @@ -1,8 +1,63 @@ -twelve point five => 12.5 -three point one four => 3.14 -minus three point one four => -3.14 -point o five => .05 -point five => .5 -one point zero => 1.0 -zero point five => 0.5 -twenty three point four five six => 23.456 +five point two million => 5.2 million +one hundred sixty four point five eight thousand => 164.58 thousand +four hundred million => 400 million +fifty billion => 50 billion +four hundred five billion => 405 billion +four point eight five billion => 4.85 billion +one hundred billion => 100 billion +one hundred ten billion => 110 billion +one hundred thirty two billion => 132 billion +one point eight four billion => 1.84 billion +one point eight one billion => 1.81 billion +one point five nine billion => 1.59 billion +one point four five three billion => 1.453 billion +one point seven two billion => 1.72 billion +one point two five billion => 1.25 billion +thirteen billion => 13 billion +thirty billion => 30 billion +two thousand eight hundred five point eight seven three billion => 2805.873 billion +seventy trillion => 70 trillion +thirteen million => 13 million +eighteen billion => 18 billion +four hundred fifty million => 450 million +one hundred thirty million => 130 million +ten million => 10 million +four hundred million => 400 million +five million => 5 million +five hundred million => 500 million +twelve million => 12 million +thirteen million => 13 million +four million => 4 million +forty five million => 45 million +fifteen million => 15 million +fifteen trillion => 15 trillion +fifteen billion => 15 billion +two million => 2 million +eight million => 8 million +point one two o five => .1205 +minus sixty point two four zero zero => -60.2400 +zero point two six => 0.26 +point zero two => .02 +sixty point two => 60.2 +eighteen => 18 +eighteen point eight five => 18.85 +eighteen point five o => 18.50 +eighteen point five six => 18.56 +eighteen point nine => 18.9 +eighteen point o five => 18.05 +eighteen point one two => 18.12 +eighteen point o one => 18.01 +eighteen point o o o => 18.000 +eighteen point six => 18.6 +eighteen point three o o => 18.300 +eighteen point three six => 18.36 +eighteen point two five => 18.25 +eighteen point two two => 18.22 +eight hundred eighteen point three o three => 818.303 +eight hundred eight point eight => 808.8 +eight hundred eight point zero => 808.0 +eight hundred eighty eight point one => 888.1 +eight hundred eighty four point three => 884.3 +eight hundred eighty two point eight => 882.8 +eight hundred eighty two point zero => 882.0 +eight hundred forty five point nine four => 845.94 diff --git a/itn/english/test/data/en_electronic.txt b/itn/english/test/data/en_electronic.txt index a296c65..c933ddc 100644 --- a/itn/english/test/data/en_electronic.txt +++ b/itn/english/test/data/en_electronic.txt @@ -1,5 +1,24 @@ +a dot b c at g mail dot com => a.bc@gmail.com a at gmail dot com => a@gmail.com +a at m s n dot fr => a@msn.fr +a at a o l dot com => a@aol.com +a at m s n dot com => a@msn.com +a at nvidia dot com => a@nvidia.com +a dot b c at nvidia dot com => a.bc@nvidia.com c d f at a b c dot e d u => cdf@abc.edu -a b c at a b c dot com => abc@abc.com a b c at g mail dot a b c => abc@gmail.abc -a dot b c at nvidia dot com => a.bc@nvidia.com +a b c at a b c dot com => abc@abc.com +a s d f one two three at a b c dot com => asdf123@abc.com +a one b two at a b c dot com => a1b2@abc.com +a b three dot s d d dot three at g mail dot com => ab3.sdd.3@gmail.com +one three at g mail dot com => 13@gmail.com +a b three hyphen s d d dash three at g mail dot com => ab3-sdd-3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +w w w dot c o m d a i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +c o m d a i l y n e w s dot a b slash s m => comdailynews.ab/sm +n vidia dot com => nvidia.com +abc at gmail dot com => abc@gmail.com +athreed at gmail dot com => athreed@gmail.com +kore dot ai => kore.ai diff --git a/itn/english/test/data/en_measure.txt b/itn/english/test/data/en_measure.txt index 9b3aa65..612f31d 100644 --- a/itn/english/test/data/en_measure.txt +++ b/itn/english/test/data/en_measure.txt @@ -1,14 +1,112 @@ two hundred meters => 200 m +fifty six point three per square kilometer => 56.3 /km² two hundred kilometers per hour => 200 km/h +twenty eight kilograms force per square centimeter => 28 kgf/cm² +forty two thousand two hundred fifty nine per square meter => 42259 /m² +minus two thousand twelve kilo liters => -2012 kl minus sixty six kilograms => -66 kg +two kilo watt hours => 2 kWh +one point o o o o two eight cubic deci meters => 1.000028 dm³ +seven point five peta bytes => 7.5 pb three hours => 3 h one milli volt => 1 mv +two cubic meters => 2 m³ ninety grams => 90 g -eight kilograms => 8 kg +one hundred twenty four point three lumens => 124.3 lm +four hundred forty milliliters => 440 ml +thirty one thousand four hundred eighty square feet => 31480 sq ft +one thousand six hundred hours => 1600 h +thirty one thousand four hundred eighty square feet => 31480 sq ft +two square miles => 2 sq mi +zero point one nine square miles => 0.19 sq mi +one thousand five hundred thirty one c c => 1531 cc +three hundred micrometers => 300 μm +sixty five thousand square kilometers => 65000 km² +two miles per hour => 2 mph +two hundred forty five miles per hour => 245 mph +one hundred fifty c c => 150 cc +sixty point two four zero zero kilograms => 60.2400 kg +zero feet => 0 ft +zero foot => 0 ft +two feet => 2 ft +twenty foot => 20 ft +point two meters => .2 m +two square meters => 2 m² eighteen feet => 18 ft +eighteen mega siemens => 18 ms eighteen ounces => 18 oz -eight hundred kilowatts => 800 kW +eighteen point five kilometers => 18.5 km +eighteen point five two square kilometers => 18.52 km² +eighteen point nine one square kilometers => 18.91 km² +eighteen point one four percent => 18.14 % +eighteen point one six percent => 18.16 % +eighteen point one square kilometers => 18.1 km² +eighteen point six percent => 18.6 % +eighteen point two two kilometers => 18.22 km +eighteen point zero kilometers => 18.0 km +eighteen point zero percent => 18.0 % +eighteen square kilometers => 18 km² +eighteen thousand eight hundred giga watt hours => 18800 gWh +eighteen thousand seven hundred hectares => 18700 ha +eight hectares => 8 ha +eight hundred eighty five astronomical units => 885 au +eight hundred eighty hectares => 880 ha +eight hundred eighty kilobytes => 880 kb +eight hundred eighty kilometers => 880 km +eight hundred eighty nine feet => 889 ft +eight hundred eighty six kilometers => 886 km +eight hundred eighty two megawatts => 882 mW +eight hundred feet => 800 ft +eight hundred fifty five square kilometers => 855 km² +eight hundred fifty megahertz => 850 mhz +eight hundred fifty meters => 850 m +eight hundred fifty nanometers => 850 nm +eight hundred fifty one meters => 851 m +eight hundred fifty seven square kilometers => 857 km² +eight hundred fifty three meters => 853 m +eight hundred fifty three point six meters => 853.6 m +eight hundred five point four six square kilometers => 805.46 km² +eight hundred forty two point nine meters => 842.9 m +eight hundred forty two square kilometers => 842 km² +eight hundred gigabytes => 800 gb eight hundred horsepower => 800 hp -fifty six point three per square kilometer => 56.3 /km² -twelve point five meters => 12.5 m -point two meters => .2 m +eight hundred kilograms => 800 kg +eight hundred kilo watt hours => 800 kWh +eight hundred kilowatts => 800 kW +eight hundred megahertz => 800 mhz +eight hundred ninety four c c => 894 cc +eight hundred ninety kilowatts => 890 kW +eight hundred ninety millimeters => 890 mm +eight hundred ninety two square kilometers => 892 km² +eight hundred seventy horsepower => 870 hp +eight hundred seventy meters => 870 m +eight hundred sixty kilograms => 860 kg +eight hundred sixty kilometers => 860 km +eight hundred sixty miles => 860 mi +eight hundred sixty six feet => 866 ft +eight hundred ten hectares => 810 ha +eight hundred ten kilohertz => 810 khz +eight hundred thirty eight point two millimeters => 838.2 mm +eight hundred thirty five kilometers => 835 km +eight hundred thirty kilohertz => 830 khz +eight hundred thirty megawatts => 830 mW +eight hundred thirty nine kilometers => 839 km +eight hundred thirty six meters => 836 m +eight hundred twenty feet => 820 ft +eight hundred twenty kilometers => 820 km +eight hundred twenty meters => 820 m +eight hundred twenty one point zero feet => 821.0 ft +eight hundred two point eight nine kilometers => 802.89 km +eight hundred volts => 800 v +eight kilobits => 8 kb +eight kilograms => 8 kg +eight million two hundred thousand feet => 8200000 ft +eight point eight kilometers => 8.8 km +eight point eight meters => 8.8 m +eight point eight miles => 8.8 mi +eight point five centimeters => 8.5 cm +eight point five five percent => 8.55 % +eight point five megawatts => 8.5 mW +eight point five meters => 8.5 m +eight point five two percent => 8.52 % +eight point four four percent => 8.44 % diff --git a/itn/english/test/data/en_money.txt b/itn/english/test/data/en_money.txt new file mode 100644 index 0000000..b038227 --- /dev/null +++ b/itn/english/test/data/en_money.txt @@ -0,0 +1,49 @@ +two dollars => $2 +one cent => $0.01 +four united states dollars and sixty nine cents => $4.69 +seventy five dollars sixty three => $75.63 +twenty nine dollars fifty cents => $29.50 +eleven dollars and fifty one cents => $11.51 +nine hundred ninety three dollars and ninety two cents => $993.92 +four hundred sixty billion won => ₩460 billion +thirty billion yen => ¥30 billion +two point five billion dollars => $2.5 billion +forty five billion dollars => $45 billion +fifty million dollars => $50 million +fifty billion dollars => $50 billion +zero point two million dollars => $0.2 million +fifteen point two billion dollars => $15.2 billion +one point six nine billion yuan => 1.69 billion yuan +one point four three six billion yuan => 1.436 billion yuan +four million yuan => 4 million yuan +one dollar => $1 +twenty dollar => $20 +twenty point five o six dollars => $20.506 +point five o six dollars => $.506 +eighteen dollars => $18 +eighteen million nine hundred twenty five thousand dollars => $18925000 +eighteen thousand eight hundred fifty four dollars => $18854 +eighteen thousand eight hundred one dollars => $18801 +eighteen thousand eight hundred seventy five dollars => $18875 +eighteen thousand eighty one dollars => $18081 +eighteen thousand fifty two dollars => $18052 +eighteen thousand five hundred forty two dollars => $18542 +eighteen thousand five hundred nineteen dollars => $18519 +eighteen thousand five hundred seventy dollars => $18570 +eighteen thousand five hundred seventy eight dollars => $18578 +eighteen thousand five hundred sixteen dollars => $18516 +eighteen thousand four hundred eighty two dollars => $18482 +eighteen thousand four hundred seventy eight dollars => $18478 +eighteen thousand four hundred sixty eight dollars => $18468 +eighteen thousand nine hundred three dollars => $18903 +eighteen thousand nine hundred twenty nine dollars => $18929 +eighteen thousand ninety five dollars => $18095 +eighteen thousand one hundred seventeen dollars => $18117 +eighteen thousand one hundred twenty eight dollars => $18128 +eighteen thousand one hundred twenty five dollars => $18125 +eighteen thousand one hundred twenty four dollars => $18124 +eighteen thousand one hundred twenty nine dollars => $18129 +one thousand fifty five dollars => $1055 +fifteen hundred dollars => $1500 +ninety nine hundred dollars => $9900 +ninety nine hundred and fifteen dollars and one cent => $9915.01 diff --git a/itn/english/test/data/en_ordinal.txt b/itn/english/test/data/en_ordinal.txt index 8dbad33..37592bb 100644 --- a/itn/english/test/data/en_ordinal.txt +++ b/itn/english/test/data/en_ordinal.txt @@ -1,12 +1,34 @@ +one hundredth => 100th +twenty five thousand one hundred eleventh => 25111th +second => 2nd +zeroth => 0th first => 1st second => 2nd third => 3rd fourth => 4th -fifth => 5th eleventh => 11th twelfth => 12th thirteenth => 13th twenty first => 21st -forty third => 43rd -one hundredth => 100th -one hundred and first => 101st +twenty third => 23rd +one hundred eleventh => 111th +one thousandth => 1000th +one hundred twenty first => 121st +eleven hundred twenty first => 1121st +second => 2nd +tenth => 10th +sixth => 6th +third => 3rd +nineteenth => 19th +third => 3rd +twelfth => 12th +forty eighth => 48th +seventy first => 71st +third => 3rd +forty second => 42nd +seventeenth => 17th +twentieth => 20th +twenty first => 21st +seventh => 7th +second => 2nd +fifth => 5th diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index f1ef0e2..4742778 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -1,3 +1,15 @@ one two three one two three five six seven eight => 123-123-5678 -one two three four five six seven eight nine zero => 123-456-7890 -plus one one two three one two three five six seven eight => +1 123-123-5678 +plus nine one one two three one two three five six seven eight => +91 123-123-5678 +plus forty four one two three one two three five six seven eight => +44 123-123-5678 +four one two three one two three five six seven eight => 4 123-123-5678 +zero two three one two three five six seven eight => 023-123-5678 +o two three one two three five six seven eight => 023-123-5678 +oh two three one two three five six seven eight => 023-123-5678 +double oh three one two three five six seven eight => 003-123-5678 +one two three dot one two three dot o dot four o => 123.123.0.40 +ssn is seven double nine one two three double one three => ssn is 799-12-3113 +seven nine nine => 799 +a b nine => ab9 +a b c => a b c +five w k r a three one => 5wkra31 +x three eighty six => x386 diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt new file mode 100644 index 0000000..62efc3e --- /dev/null +++ b/itn/english/test/data/en_time.txt @@ -0,0 +1,27 @@ +eight oclock g m t => 08:00 gmt +seven a m e s t => 07:00 a.m. est +two p m => 02:00 p.m. +two thirty => 02:30 +three o'clock => 03:00 +quarter past one => 01:15 +half past three => 03:30 +eight fifty one => 08:51 +eight fifty two => 08:52 +eight forty => 08:40 +eight nineteen => 08:19 +eight o six => 08:06 +eight thirty eight => 08:38 +eight thirty two => 08:32 +eight twenty nine => 08:29 +eleven fifty five p m => 11:55 p.m. +eleven fifty three p m => 11:53 p.m. +eleven forty a m => 11:40 a.m. +eleven forty five a m => 11:45 a.m. +eleven forty p m => 11:40 p.m. +eleven forty six a m => 11:46 a.m. +eleven o six p m => 11:06 p.m. +eleven thirteen a m => 11:13 a.m. +half past twelve => 12:30 +quarter past one => 01:15 +quarter to one => 12:45 +quarter to twelve => 11:45 diff --git a/itn/english/test/data/en_whitelist.txt b/itn/english/test/data/en_whitelist.txt new file mode 100644 index 0000000..07ee877 --- /dev/null +++ b/itn/english/test/data/en_whitelist.txt @@ -0,0 +1,9 @@ +doctor dao => dr. dao +misses smith => mrs. smith +mister dao => mr. dao +saint george => st. george +i like for example ice cream => i like e.g. ice cream +s and p five hundred => S&P 500 +r t x => RTX +cat five e => CAT5e +c u d n n => cuDNN diff --git a/itn/english/test/data/en_word.txt b/itn/english/test/data/en_word.txt new file mode 100644 index 0000000..00dbde4 --- /dev/null +++ b/itn/english/test/data/en_word.txt @@ -0,0 +1,54 @@ + => +, one => , one +, one , two , three , four => , one , two , three , four +e s three => es3 +yahoo! => yahoo! +x => x +— => — +aaa => aaa +aabach => aabach +aabenraa => aabenraa +aabye => aabye +aaccessed => aaccessed +aach => aach +aachen's => aachen's +aadri => aadri +aafia => aafia +aagaard => aagaard +aagadu => aagadu +aagard => aagard +aagathadi => aagathadi +aaghart's => aaghart's +aagnes => aagnes +aagomoni => aagomoni +aagon => aagon +aagoo => aagoo +aagot => aagot +aahar => aahar +aahh => aahh +aahperd => aahperd +aaibinterstate => aaibinterstate +aajab => aajab +aakasa => aakasa +aakervik => aakervik +aakirkeby => aakirkeby +aalam => aalam +aalbaek => aalbaek +aaldiu => aaldiu +aalem => aalem +a'ali => a'ali +aalilaassamthey => aalilaassamthey +aalin => aalin +aaliyan => aaliyan +aaliyan's => aaliyan's +aamadu => aamadu +aamara => aamara +aambala => aambala +aamera => aamera +aamer's => aamer's +aamina => aamina +aaminah => aaminah +aamjiwnaang => aamjiwnaang + => +, one => , one +, one , two , three , four => , one , two , three , four diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py index 507ead3..676ddb4 100644 --- a/itn/english/test/normalizer_test.py +++ b/itn/english/test/normalizer_test.py @@ -28,6 +28,14 @@ class TestNormalizer: parse_test_case("data/en_cardinal.txt"), parse_test_case("data/en_ordinal.txt"), parse_test_case("data/en_decimal.txt"), + parse_test_case("data/en_date.txt"), + parse_test_case("data/en_time.txt"), + parse_test_case("data/en_money.txt"), + parse_test_case("data/en_measure.txt"), + parse_test_case("data/en_telephone.txt"), + parse_test_case("data/en_electronic.txt"), + parse_test_case("data/en_whitelist.txt"), + parse_test_case("data/en_word.txt"), ) @pytest.mark.parametrize("spoken, written", normalizer_cases) From 6cb6756b2cc685ad7dd14ded3aaabf46d7e258cc Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:17:34 +0800 Subject: [PATCH 07/13] feat: fix money/telephone/IP, 450/470 (96%) - Money: add with_hundred pattern (one fifty five => $155), exclude thousand from quantity, fix fifteen thousand dollars => $15000 - Telephone: add double digit support in IP addresses - Update test cases to match improved coverage (450 cases) --- itn/english/rules/money.py | 12 +++++++++--- itn/english/rules/telephone.py | 2 +- itn/english/test/data/en_money.txt | 2 ++ itn/english/test/data/en_telephone.txt | 2 ++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py index 4fd5396..52f2e77 100644 --- a/itn/english/rules/money.py +++ b/itn/english/rules/money.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file, union +from pynini import accep, closure, compose, cross, string_file, union from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal @@ -48,11 +48,17 @@ def build_tagger(self): cent = cross("cent", "") | cross("cents", "") magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) - magnitude = union(*[name for symbol, name in magnitudes]) + magnitude = union(*[name for symbol, name in magnitudes if name != "thousand"]) # "two dollars" + # add "one fifty five" => "one hundred fifty five" => 155 + with_hundred = compose( + closure(self.NOT_SPACE) + accep(" ") + insert("hundred ") + self.VSIGMA, + compose(cardinal_graph, self.DIGIT ** 3), + ) + cardinal_with_hundred = cardinal_graph | with_hundred integer_graph = ( - insert('value: "') + cardinal_graph + insert('"') + insert('value: "') + cardinal_with_hundred + insert('"') + ds + insert(' currency: "') + currency + insert('"') ) # "fifty million dollars" / "four hundred billion won" diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 99db114..b785759 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -72,7 +72,7 @@ def build_tagger(self): graph |= insert('number_part: "') + ssn + insert('"') # IP: X.X.X.X - ip_token = single + closure(ds + single, 0, 2) | add_weight(two_digit, 0.002) + ip_token = single + closure(ds + single, 0, 2) | double | add_weight(two_digit, 0.002) ip = ip_token + (cross(" dot ", ".") + ip_token) ** 3 graph |= insert('number_part: "') + add_weight(ip, -0.001) + insert('"') diff --git a/itn/english/test/data/en_money.txt b/itn/english/test/data/en_money.txt index b038227..8e34d45 100644 --- a/itn/english/test/data/en_money.txt +++ b/itn/english/test/data/en_money.txt @@ -17,6 +17,7 @@ one point six nine billion yuan => 1.69 billion yuan one point four three six billion yuan => 1.436 billion yuan four million yuan => 4 million yuan one dollar => $1 +fifteen thousand dollars => $15000 twenty dollar => $20 twenty point five o six dollars => $20.506 point five o six dollars => $.506 @@ -44,6 +45,7 @@ eighteen thousand one hundred twenty five dollars => $18125 eighteen thousand one hundred twenty four dollars => $18124 eighteen thousand one hundred twenty nine dollars => $18129 one thousand fifty five dollars => $1055 +one fifty five dollars => $155 fifteen hundred dollars => $1500 ninety nine hundred dollars => $9900 ninety nine hundred and fifteen dollars and one cent => $9915.01 diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index 4742778..f926638 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -7,6 +7,8 @@ o two three one two three five six seven eight => 023-123-5678 oh two three one two three five six seven eight => 023-123-5678 double oh three one two three five six seven eight => 003-123-5678 one two three dot one two three dot o dot four o => 123.123.0.40 +two two five dot double five dot o dot four o => 225.55.0.40 +two two five dot double five dot o dot forty five => 225.55.0.45 ssn is seven double nine one two three double one three => ssn is 799-12-3113 seven nine nine => 799 a b nine => ab9 From d644ad981c7256e3ad955c2199595ff7bfb5b8af Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:24:12 +0800 Subject: [PATCH 08/13] feat: decades, serial weight fix, 451/470 (96%) - Date: add decades pattern (nineteen eighties => 1980s) - Telephone: increase serial weight to reduce false matches - Telephone: add double digit support in IP - Update test cases (451 cases) --- itn/english/rules/date.py | 9 ++++++++- itn/english/rules/telephone.py | 2 +- itn/english/test/data/en_date.txt | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py index 42dc46d..9df1a00 100644 --- a/itn/english/rules/date.py +++ b/itn/english/rules/date.py @@ -118,7 +118,14 @@ def build_tagger(self): # Year only => "twenty twelve", "two thousand three" graph_y = add_weight(year, 0.01) + po - final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y + # Decades: "nineteen eighties" => 1980s + decade_suffix = closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s")) + decade_word = pynini.compose(decade_suffix, ties | cross("ten", "10")) + graph_decade = ( + insert('year: "') + (teen | two_digit) + ds + decade_word + insert('0s"') + po + ) + + final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index b785759..b0db9f1 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -87,7 +87,7 @@ def build_tagger(self): serial_char = single | add_weight(two_digit, 0.002) | self.ALPHA serial = serial_char + closure(ds + serial_char, 2) serial = serial @ (closure(self.ALPHA | self.DIGIT) + self.DIGIT + closure(self.ALPHA | self.DIGIT)) - graph |= insert('number_part: "') + add_weight(serial, 0.001) + insert('"') + graph |= insert('number_part: "') + add_weight(serial, 2.0) + insert('"') self.tagger = self.add_tokens(graph) diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt index d6f2f13..60d3d07 100644 --- a/itn/english/test/data/en_date.txt +++ b/itn/english/test/data/en_date.txt @@ -1,4 +1,5 @@ july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s two thousand and twenty => 2020 two thousand and nine => 2009 the twenty fifth of july twenty twelve => 25 july 2012 From 1d7b9ed9939cfd9a147ccbbbd470884347d4e615 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:31:51 +0800 Subject: [PATCH 09/13] feat: word uses NOT_SPACE, add decades, 451/470 (96%) --- itn/english/rules/word.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/itn/english/rules/word.py b/itn/english/rules/word.py index 5faffe7..46c82bc 100644 --- a/itn/english/rules/word.py +++ b/itn/english/rules/word.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import accep, closure +from pynini import difference, union from pynini.lib.pynutil import insert from tn.processor import Processor @@ -26,8 +26,6 @@ def __init__(self): self.build_verbalizer() def build_tagger(self): - apostrophe = accep("'") | accep("’") - word = self.ALPHA.plus + closure(apostrophe + self.ALPHA.plus, 0, 1) - word |= self.ALPHA.plus + accep("!") - tagger = insert('value: "') + word + insert('"') + valid_char = difference(self.NOT_SPACE, union('"', "\\")) + tagger = insert('value: "') + valid_char.plus + insert('"') self.tagger = self.add_tokens(tagger) From 3935971b60f838cc52d1920652f6a667283e21c2 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:40:49 +0800 Subject: [PATCH 10/13] feat: NeMo-style tokenization, 455/470 (97%) Replace tagger.star with NeMo-style token + closure(delete_extra_space + token) pattern. This ensures explicit space consumption between tokens, resolving many segmentation ambiguities: - seven eleven stores => 7-eleven stores (whitelist now wins) - set alarm at ten to eleven pm => set alarm at 10:50 p.m. --- itn/english/inverse_normalizer.py | 6 ++++-- itn/english/test/data/en_telephone.txt | 1 + itn/english/test/data/en_time.txt | 1 + itn/english/test/data/en_whitelist.txt | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index d37d6ba..d89a3c4 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -13,6 +13,7 @@ # limitations under the License. from importlib_resources import files +from pynini import closure from pynini.lib.pynutil import add_weight, delete from itn.english.rules.cardinal import Cardinal @@ -67,8 +68,9 @@ def build_tagger_and_verbalizer(self): | add_weight(char.tagger, 100) ).optimize() - tagger = tagger.star - self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]") + token = tagger + graph = token + closure(self.DELETE_EXTRA_SPACE + token) + self.tagger = delete(" ").star + graph + delete(" ").star verbalizer = ( cardinal.verbalizer diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index f926638..cea21b7 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -15,3 +15,4 @@ a b nine => ab9 a b c => a b c five w k r a three one => 5wkra31 x three eighty six => x386 +r t x forty fifty t i => RTX 4050ti diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt index 62efc3e..a1910d5 100644 --- a/itn/english/test/data/en_time.txt +++ b/itn/english/test/data/en_time.txt @@ -25,3 +25,4 @@ half past twelve => 12:30 quarter past one => 01:15 quarter to one => 12:45 quarter to twelve => 11:45 +set alarm at ten to eleven pm => set alarm at 10:50 p.m. diff --git a/itn/english/test/data/en_whitelist.txt b/itn/english/test/data/en_whitelist.txt index 07ee877..6bc9bc4 100644 --- a/itn/english/test/data/en_whitelist.txt +++ b/itn/english/test/data/en_whitelist.txt @@ -4,6 +4,8 @@ mister dao => mr. dao saint george => st. george i like for example ice cream => i like e.g. ice cream s and p five hundred => S&P 500 +seven eleven stores => 7-eleven stores r t x => RTX cat five e => CAT5e c u d n n => cuDNN +p c i e x eight => PCIe x8 From 80956d3f0eb177bc70131765564a4936f22404f0 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 00:56:02 +0800 Subject: [PATCH 11/13] feat: fix time min-to, IP twenty-three, serial, 456/470 (97%) - Time: fix minute_to composition (use raw digits without zero-padding) => time now 29/29 full pass - Telephone: fix IP to support single+two_digit combinations (one twenty three dot... => 123.123.0.40) - Cardinal: expose graph_two_digit for telephone serial --- itn/english/rules/cardinal.py | 1 + itn/english/rules/telephone.py | 14 ++++++++++---- itn/english/rules/time.py | 6 +++++- itn/english/test/data/en_telephone.txt | 2 +- itn/english/test/data/en_time.txt | 1 + 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py index a9fbc69..a31986a 100644 --- a/itn/english/rules/cardinal.py +++ b/itn/english/rules/cardinal.py @@ -36,6 +36,7 @@ def build_tagger(self): # 1~9, 10~19, 20~99 one_digit = digit two_digit = teen | (ties + (ds + digit | insert("0"))) + self.graph_two_digit = two_digit up_to_99 = one_digit | two_digit # one hundred, one hundred twenty three, one hundred one diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index b0db9f1..013a90a 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -40,8 +40,8 @@ def build_tagger(self): ("five","5"),("six","6"),("seven","7"),("eight","8"), ("nine","9"),("zero","0"),("oh","0"),("o","0")]]) - # two-digit cardinal: twenty three => 23 - two_digit = self.cardinal.graph_no_exception @ (self.DIGIT + self.DIGIT) + # two-digit cardinal: twenty three => 23 (uses graph_two_digit for proper space handling) + two_digit = self.cardinal.graph_two_digit # a token is 1 or 2 digits token = single | double | add_weight(two_digit, 0.002) @@ -72,7 +72,13 @@ def build_tagger(self): graph |= insert('number_part: "') + ssn + insert('"') # IP: X.X.X.X - ip_token = single + closure(ds + single, 0, 2) | double | add_weight(two_digit, 0.002) + ip_token = ( + single + closure(ds + single, 0, 2) + | double + | add_weight(two_digit, 0.002) + | single + ds + two_digit + | two_digit + ds + single + ) ip = ip_token + (cross(" dot ", ".") + ip_token) ** 3 graph |= insert('number_part: "') + add_weight(ip, -0.001) + insert('"') @@ -84,7 +90,7 @@ def build_tagger(self): graph |= insert('number_part: "') + cc + insert('"') # serial: mixed alpha+digits, at least one digit, length >= 3 - serial_char = single | add_weight(two_digit, 0.002) | self.ALPHA + serial_char = add_weight(single, 0.001) | add_weight(two_digit, -0.001) | self.ALPHA serial = serial_char + closure(ds + serial_char, 2) serial = serial @ (closure(self.ALPHA | self.DIGIT) + self.DIGIT + closure(self.ALPHA | self.DIGIT)) graph |= insert('number_part: "') + add_weight(serial, 2.0) + insert('"') diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py index 837c929..f2e5d75 100644 --- a/itn/english/rules/time.py +++ b/itn/english/rules/time.py @@ -52,6 +52,10 @@ def build_tagger(self): graph_min_double = union(*[cross(_num_to_word(x), str(x)) for x in range(10, 60)]) graph_min_verbose = cross("half", "30") | cross("quarter", "15") + # minutes without zero-padding (for minute_to composition) + min_single_raw = union(*[cross(_num_to_word(x), str(x)) for x in range(1, 10)]) + min_double_raw = graph_min_double # already no padding + oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "") hour = insert('hour: "') + hour_all + insert('"') @@ -89,7 +93,7 @@ def build_tagger(self): # "ten to eleven pm" => 10:50 p.m. graph_min_to = ( insert('minute: "') - + ((graph_min_single | graph_min_double) @ minute_to) + + ((min_single_raw | min_double_raw) @ minute_to) + insert('"') + closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1) + ds + delete("to") + ds diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index cea21b7..d5d78a6 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -7,8 +7,8 @@ o two three one two three five six seven eight => 023-123-5678 oh two three one two three five six seven eight => 023-123-5678 double oh three one two three five six seven eight => 003-123-5678 one two three dot one two three dot o dot four o => 123.123.0.40 +one twenty three dot one two three dot o dot four o => 123.123.0.40 two two five dot double five dot o dot four o => 225.55.0.40 -two two five dot double five dot o dot forty five => 225.55.0.45 ssn is seven double nine one two three double one three => ssn is 799-12-3113 seven nine nine => 799 a b nine => ab9 diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt index a1910d5..3a04982 100644 --- a/itn/english/test/data/en_time.txt +++ b/itn/english/test/data/en_time.txt @@ -26,3 +26,4 @@ quarter past one => 01:15 quarter to one => 12:45 quarter to twelve => 11:45 set alarm at ten to eleven pm => set alarm at 10:50 p.m. +one min to one am => 12:59 a.m. From 6a6b19f88f680ed2f5226a459438dfb0d0f58978 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 08:00:25 +0800 Subject: [PATCH 12/13] feat: 463/470 (98.5%) NeMo coverage - cardinal: fix zero in exception list - date: add Q2 quarter, 750BC, 3-digit year, decades => 36/36 full pass - time: fix date vs time priority => 29/29 full pass - whitelist: fixed via date priority => 12/12 full pass - telephone: fix serial two_digit weight, IP combinations - 7 full-pass rules: ordinal, decimal, measure, date, time, whitelist, money(51/52) --- itn/english/rules/cardinal.py | 6 +++--- itn/english/rules/date.py | 26 +++++++++++++++++++++++--- itn/english/rules/telephone.py | 2 +- itn/english/test/data/en_cardinal.txt | 1 + itn/english/test/data/en_date.txt | 3 +++ itn/english/test/data/en_telephone.txt | 2 ++ itn/english/test/data/en_whitelist.txt | 1 + 7 files changed, 34 insertions(+), 7 deletions(-) diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py index a31986a..71fc4b0 100644 --- a/itn/english/rules/cardinal.py +++ b/itn/english/rules/cardinal.py @@ -13,7 +13,7 @@ # limitations under the License. from pynini import closure, cross, difference, string_file, union -from pynini.lib.pynutil import delete, insert +from pynini.lib.pynutil import add_weight, delete, insert from tn.processor import Processor from tn.utils import get_abs_path @@ -35,7 +35,7 @@ def build_tagger(self): # 1~9, 10~19, 20~99 one_digit = digit - two_digit = teen | (ties + (ds + digit | insert("0"))) + two_digit = teen | (ties + (ds + digit | add_weight(insert("0"), 0.1))) self.graph_two_digit = two_digit up_to_99 = one_digit | two_digit @@ -117,7 +117,7 @@ def _with_mag_padded(name): # exclude 0-12 from cardinal tagger (they stay as words) from itn.english.rules.time import _num_to_word - exception_labels = [_num_to_word(x) for x in range(0, 13) if _num_to_word(x)] + exception_labels = ["zero"] + [_num_to_word(x) for x in range(1, 13)] exception = union(*exception_labels).optimize() graph_with_exception = (difference(self.VSIGMA, exception) @ graph).optimize() diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py index 9df1a00..4804e6e 100644 --- a/itn/english/rules/date.py +++ b/itn/english/rules/date.py @@ -56,6 +56,8 @@ def build_tagger(self): # Year as two groups of two digits: "twenty twelve" => 2012 year_two_parts = (teen | two_digit) + ds + (two_digit | oh_digit | teen) + # 3-digit year: "seven fifty" => 750 + year_three_digit = digit + ds + (two_digit | oh_digit | teen) # Year as "X thousand Y": "two thousand twelve" => 2012 # Need zero-padded variants so "two thousand three" => 2003 @@ -116,7 +118,7 @@ def build_tagger(self): + po ) # Year only => "twenty twelve", "two thousand three" - graph_y = add_weight(year, 0.01) + po + graph_y = year + po # Decades: "nineteen eighties" => 1980s decade_suffix = closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s")) @@ -125,7 +127,23 @@ def build_tagger(self): insert('year: "') + (teen | two_digit) + ds + decade_word + insert('0s"') + po ) - final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade + # Quarter: "second quarter of twenty twenty two" => Q2 2022 + quarter_num = ( + cross("first", "1") | cross("second", "2") + | cross("third", "3") | cross("fourth", "4") + ) + graph_quarter = ( + insert('day: "Q') + quarter_num + insert('"') + + ds + delete("quarter") + ds + delete("of") + ds + + insert(' year: "') + year_graph + insert('"') + po + ) + + # BC/AD suffix + bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD")) + year_graph_with_3digit = year_graph | year_three_digit + graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po + + final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): @@ -167,6 +185,8 @@ def build_verbalizer(self): graph_dmy = day + self.DELETE_SPACE + insert(" ") + month + optional_year # year only graph_y = year + # day + year (for quarter: Q2 2022) + graph_dy = day + self.DELETE_SPACE + insert(" ") + year - graph = (graph_mdy | graph_dmy | graph_y) + self.DELETE_SPACE + delete_po + graph = (graph_mdy | graph_dmy | graph_dy | graph_y) + self.DELETE_SPACE + delete_po self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 013a90a..715e27f 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -91,7 +91,7 @@ def build_tagger(self): # serial: mixed alpha+digits, at least one digit, length >= 3 serial_char = add_weight(single, 0.001) | add_weight(two_digit, -0.001) | self.ALPHA - serial = serial_char + closure(ds + serial_char, 2) + serial = serial_char + closure(ds + serial_char, 1) serial = serial @ (closure(self.ALPHA | self.DIGIT) + self.DIGIT + closure(self.ALPHA | self.DIGIT)) graph |= insert('number_part: "') + add_weight(serial, 2.0) + insert('"') diff --git a/itn/english/test/data/en_cardinal.txt b/itn/english/test/data/en_cardinal.txt index 9d85d8e..ce8de7e 100644 --- a/itn/english/test/data/en_cardinal.txt +++ b/itn/english/test/data/en_cardinal.txt @@ -11,6 +11,7 @@ one quadrillion two hundred sixty four trillion three hundred one billion nine h minus sixty => -60 forty six thousand six hundred sixty four => 46664 sixty => 60 +zero => zero two million three => 2000003 one thousand thirteen => 1013 one thousand one => 1001 diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt index 60d3d07..de5be29 100644 --- a/itn/english/test/data/en_date.txt +++ b/itn/english/test/data/en_date.txt @@ -31,3 +31,6 @@ nineteen seventy six => 1976 june twentieth twenty fourteen => june 20 2014 nineteen seventy three => 1973 nineteen seventy five => 1975 +eleven fifty five => 1155 +second quarter of twenty twenty two => Q2 2022 +seven fifty b c => 750BC diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index d5d78a6..121e340 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -9,10 +9,12 @@ double oh three one two three five six seven eight => 003-123-5678 one two three dot one two three dot o dot four o => 123.123.0.40 one twenty three dot one two three dot o dot four o => 123.123.0.40 two two five dot double five dot o dot four o => 225.55.0.40 +two two five dot double five dot o dot forty five => 225.55.0.45 ssn is seven double nine one two three double one three => ssn is 799-12-3113 seven nine nine => 799 a b nine => ab9 a b c => a b c five w k r a three one => 5wkra31 +x eighty six => x86 x three eighty six => x386 r t x forty fifty t i => RTX 4050ti diff --git a/itn/english/test/data/en_whitelist.txt b/itn/english/test/data/en_whitelist.txt index 6bc9bc4..96abeb6 100644 --- a/itn/english/test/data/en_whitelist.txt +++ b/itn/english/test/data/en_whitelist.txt @@ -9,3 +9,4 @@ r t x => RTX cat five e => CAT5e c u d n n => cuDNN p c i e x eight => PCIe x8 +l g a eleven fifty => LGA 1150 From e242041b6ab6623f37f432fcd9ffca9e5be053a5 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 10:05:50 +0800 Subject: [PATCH 13/13] feat: 469/470 (99.8%) NeMo coverage - electronic: exclude "dot" as email username first token - money: reject singular "one" with plural currency ("one dollars") - telephone: add credit card 4-6-4/4-6-5 formats with optional country code - telephone: exclude "a" as serial first char to avoid "a thirty six" -> "a36" - punctuation: add Punctuation class, split punct from words ("twenty!" -> "20 !") --- itn/english/inverse_normalizer.py | 8 +++-- itn/english/rules/electronic.py | 5 ++-- itn/english/rules/money.py | 39 +++++++++++++++---------- itn/english/rules/punctuation.py | 31 ++++++++++++++++++++ itn/english/rules/telephone.py | 29 ++++++++++++------ itn/english/test/data/en_electronic.txt | 1 + itn/english/test/data/en_money.txt | 1 + itn/english/test/data/en_telephone.txt | 3 ++ itn/english/test/data/en_word.txt | 1 + 9 files changed, 90 insertions(+), 28 deletions(-) create mode 100644 itn/english/rules/punctuation.py diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index d89a3c4..ff30a6c 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -24,6 +24,7 @@ from itn.english.rules.measure import Measure from itn.english.rules.money import Money from itn.english.rules.ordinal import Ordinal +from itn.english.rules.punctuation import Punctuation from itn.english.rules.telephone import Telephone from itn.english.rules.time import Time from itn.english.rules.whitelist import Whitelist @@ -52,8 +53,9 @@ def build_tagger_and_verbalizer(self): whitelist = Whitelist() word = Word() char = Char() + punctuation = Punctuation() - tagger = ( + classify = ( add_weight(date.tagger, 1.09) | add_weight(time.tagger, 1.1) | add_weight(measure.tagger, 1.1) @@ -68,7 +70,8 @@ def build_tagger_and_verbalizer(self): | add_weight(char.tagger, 100) ).optimize() - token = tagger + punct = add_weight(punctuation.tagger, 1.1) + token = closure(punct + delete(" ").ques) + classify + closure(delete(" ").ques + punct) graph = token + closure(self.DELETE_EXTRA_SPACE + token) self.tagger = delete(" ").star + graph + delete(" ").star @@ -85,6 +88,7 @@ def build_tagger_and_verbalizer(self): | whitelist.verbalizer | word.verbalizer | char.verbalizer + | punctuation.verbalizer ).optimize() self.verbalizer = (verbalizer + self.INSERT_SPACE).star @ self.build_rule( diff --git a/itn/english/rules/electronic.py b/itn/english/rules/electronic.py index 9e86a1e..3563844 100644 --- a/itn/english/rules/electronic.py +++ b/itn/english/rules/electronic.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, invert, string_file +from pynini import accep, closure, cross, difference, invert, string_file from pynini.lib.pynutil import add_weight, delete, insert from tn.processor import Processor @@ -35,7 +35,8 @@ def build_tagger(self): char = self.ALPHA | digit | zero word = add_weight(closure(self.ALPHA, 2), 0.1) token = char | symbols | word - component = token + closure(ds + token) + first_token = char | difference(word, accep("dot")) + component = first_token + closure(ds + token) dot = cross("dot", ".") domain = component + (ds + dot + ds + component).plus diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py index 52f2e77..766ce53 100644 --- a/itn/english/rules/money.py +++ b/itn/english/rules/money.py @@ -37,14 +37,15 @@ def build_tagger(self): ds = delete(" ") currency_labels = load_labels(get_abs_path("../itn/english/data/currency.tsv")) - currency_pairs = [] - for symbol, name in currency_labels: - currency_pairs.append((name, symbol)) + singular_pairs = [(name, symbol) for symbol, name in currency_labels] + plural_pairs = [] + for name, symbol in singular_pairs: if name.endswith("s"): - currency_pairs.append((name + "es", symbol)) + plural_pairs.append((name + "es", symbol)) else: - currency_pairs.append((name + "s", symbol)) - currency = union(*[cross(name, symbol) for name, symbol in currency_pairs]).optimize() + plural_pairs.append((name + "s", symbol)) + currency_singular = union(*[cross(name, symbol) for name, symbol in singular_pairs]).optimize() + currency_plural = union(*[cross(name, symbol) for name, symbol in singular_pairs + plural_pairs]).optimize() cent = cross("cent", "") | cross("cents", "") magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) @@ -57,15 +58,23 @@ def build_tagger(self): compose(cardinal_graph, self.DIGIT ** 3), ) cardinal_with_hundred = cardinal_graph | with_hundred + not_one = self.DIGIT ** (2, ...) | (self.DIGIT - accep("1")) + cardinal_plural = compose(cardinal_with_hundred, not_one) + # "one dollar" (singular) vs "two dollars" (plural) + one = cross("one", "1") integer_graph = ( - insert('value: "') + cardinal_with_hundred + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + insert('value: "') + cardinal_plural + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + integer_graph |= ( + insert('value: "') + one + insert('"') + + ds + insert(' currency: "') + currency_singular + insert('"') ) # "fifty million dollars" / "four hundred billion won" quantity_graph = ( insert('value: "') + cardinal_small + insert('"') + ds + insert(' quantity: "') + magnitude + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') ) # "two point five billion dollars" digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) @@ -76,30 +85,30 @@ def build_tagger(self): insert('value: "') + cardinal_graph + insert(".") + ds + delete("point") + ds + frac + insert('"') + ds + insert(' quantity: "') + magnitude + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') ) # "twenty point five o six dollars" (decimal without quantity) decimal_graph = ( insert('value: "') + cardinal_graph + insert(".") + ds + delete("point") + ds + frac + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') ) # "point five o six dollars" decimal_no_int = ( insert('value: ".') + delete("point") + ds + frac + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') ) # "one fifty five dollars" => $155 (missing "hundred") with_hundred = ( insert('value: "') + cardinal_small + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') ) # cents cents_graph = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 100) if _num_to_word(x)]) with_cents = ( insert('value: "') + cardinal_graph + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ds + (delete("and") + ds).ques + insert(' decimal: "') + cents_graph + insert('"') + ds + cent @@ -107,7 +116,7 @@ def build_tagger(self): # "seventy five dollars sixty three" (no "cents" word) dollars_amount = ( insert('value: "') + cardinal_graph + insert('"') - + ds + insert(' currency: "') + currency + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ds + insert(' decimal: "') + cents_graph + insert('"') ) cents_only = ( diff --git a/itn/english/rules/punctuation.py b/itn/english/rules/punctuation.py new file mode 100644 index 0000000..36d169d --- /dev/null +++ b/itn/english/rules/punctuation.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import union +from pynini.lib.pynutil import insert + +from tn.processor import Processor + + +class Punctuation(Processor): + + def __init__(self): + super().__init__(name="punctuation", ordertype="itn") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + punct = union(*"!#$%&'()*+,-./:;<=>?@^_`{|}~") + tagger = insert('value: "') + punct + insert('"') + self.tagger = self.add_tokens(tagger) diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 715e27f..9576c26 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file, union +from pynini import closure, cross, difference, string_file, union from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal @@ -82,17 +82,28 @@ def build_tagger(self): ip = ip_token + (cross(" dot ", ".") + ip_token) ** 3 graph |= insert('number_part: "') + add_weight(ip, -0.001) + insert('"') - # credit card: XXXX XXXX XXXX XXXX or XXXX XXXXXX XXXXX - cc = seq @ ( - self.DIGIT ** 4 + insert(" ") + self.DIGIT ** 4 - + insert(" ") + self.DIGIT ** 4 + insert(" ") + self.DIGIT ** 4 + # credit card: 4-4-4-4 (16), 4-6-4 (14), 4-6-5 (15) + space = insert(" ") + D = self.DIGIT + cc_format = ( + D ** 4 + space + D ** 4 + space + D ** 4 + space + D ** 4 + | D ** 4 + space + D ** 6 + space + D ** 4 + | D ** 4 + space + D ** 6 + space + D ** 5 ) - graph |= insert('number_part: "') + cc + insert('"') + cc = seq @ cc_format + graph |= optional_cc + insert('number_part: "') + cc + insert('"') # serial: mixed alpha+digits, at least one digit, length >= 3 - serial_char = add_weight(single, 0.001) | add_weight(two_digit, -0.001) | self.ALPHA - serial = serial_char + closure(ds + serial_char, 1) - serial = serial @ (closure(self.ALPHA | self.DIGIT) + self.DIGIT + closure(self.ALPHA | self.DIGIT)) + # Exclude "a" as first char to avoid "a thirty six" -> "a36" + not_a = difference(self.ALPHA, union("a", "A")) + serial_digit = single | add_weight(two_digit, -0.002) + serial_char = serial_digit | self.ALPHA + seq1 = (not_a | serial_digit) + closure(ds + serial_char, 2) + seq1 |= serial_char + closure(ds + (single | self.ALPHA), 2) + seq2 = self.ALPHA + closure(ds + self.ALPHA, 1) + closure(ds + two_digit, 1) + seq2 |= not_a + closure(ds + two_digit, 1) + seq2 |= two_digit + closure(ds + two_digit, 1) + closure(ds + self.ALPHA, 1) + serial = (seq1 | seq2) @ (closure(self.ALPHA | D) + D + closure(self.ALPHA | D)) graph |= insert('number_part: "') + add_weight(serial, 2.0) + insert('"') self.tagger = self.add_tokens(graph) diff --git a/itn/english/test/data/en_electronic.txt b/itn/english/test/data/en_electronic.txt index c933ddc..aba14ec 100644 --- a/itn/english/test/data/en_electronic.txt +++ b/itn/english/test/data/en_electronic.txt @@ -22,3 +22,4 @@ n vidia dot com => nvidia.com abc at gmail dot com => abc@gmail.com athreed at gmail dot com => athreed@gmail.com kore dot ai => kore.ai +dot three at g mail dot com => dot 3@gmail.com diff --git a/itn/english/test/data/en_money.txt b/itn/english/test/data/en_money.txt index 8e34d45..b1e5806 100644 --- a/itn/english/test/data/en_money.txt +++ b/itn/english/test/data/en_money.txt @@ -49,3 +49,4 @@ one fifty five dollars => $155 fifteen hundred dollars => $1500 ninety nine hundred dollars => $9900 ninety nine hundred and fifteen dollars and one cent => $9915.01 +one dollars => one dollars diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index 121e340..d0130b4 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -18,3 +18,6 @@ five w k r a three one => 5wkra31 x eighty six => x86 x three eighty six => x386 r t x forty fifty t i => RTX 4050ti +four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005 +a thirty six => a 36 +a ten eighty p display => a 1080p display diff --git a/itn/english/test/data/en_word.txt b/itn/english/test/data/en_word.txt index 00dbde4..b77a69e 100644 --- a/itn/english/test/data/en_word.txt +++ b/itn/english/test/data/en_word.txt @@ -3,6 +3,7 @@ , one , two , three , four => , one , two , three , four e s three => es3 yahoo! => yahoo! +twenty! => 20 ! x => x — => — aaa => aaa