diff --git a/itn/english/data/measurements.tsv b/itn/english/data/measurements.tsv index 894eacd..4f50f9a 100644 --- a/itn/english/data/measurements.tsv +++ b/itn/english/data/measurements.tsv @@ -143,3 +143,7 @@ gy gray sv sievert cwt hundredweight cc c c +mph miles per hour +sq ft square feet +kgf/cm² kilograms force per square centimeter +kgf/cm² kilogram force per square centimeter diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index 446ffe3..ff30a6c 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -13,12 +13,22 @@ # limitations under the License. from importlib_resources import files +from pynini import closure from pynini.lib.pynutil import add_weight, delete from itn.english.rules.cardinal import Cardinal from itn.english.rules.char import Char +from itn.english.rules.date import Date from itn.english.rules.decimal import Decimal +from itn.english.rules.electronic import Electronic +from itn.english.rules.measure import Measure +from itn.english.rules.money import Money from itn.english.rules.ordinal import Ordinal +from itn.english.rules.punctuation import Punctuation +from itn.english.rules.telephone import Telephone +from itn.english.rules.time import Time +from itn.english.rules.whitelist import Whitelist +from itn.english.rules.word import Word from tn.processor import Processor @@ -34,23 +44,53 @@ def build_tagger_and_verbalizer(self): cardinal = Cardinal() ordinal = Ordinal(cardinal=cardinal) decimal = Decimal(cardinal=cardinal) + date = Date(cardinal=cardinal, ordinal=ordinal) + time = Time(cardinal=cardinal) + measure = Measure(cardinal=cardinal, decimal=decimal) + money = Money(cardinal=cardinal, decimal=decimal) + telephone = Telephone(cardinal=cardinal) + electronic = Electronic() + whitelist = Whitelist() + word = Word() char = Char() + punctuation = Punctuation() - tagger = ( - add_weight(ordinal.tagger, 1.0) - | add_weight(decimal.tagger, 1.01) - | add_weight(cardinal.tagger, 1.02) + classify = ( + add_weight(date.tagger, 1.09) + | add_weight(time.tagger, 1.1) + | add_weight(measure.tagger, 1.1) + | add_weight(money.tagger, 1.08) + | add_weight(whitelist.tagger, 1.01) + | add_weight(telephone.tagger, 1.1) + | add_weight(electronic.tagger, 1.1) + | add_weight(ordinal.tagger, 1.09) + | add_weight(decimal.tagger, 1.1) + | add_weight(cardinal.tagger, 1.1) + | add_weight(word.tagger, 50) | add_weight(char.tagger, 100) ).optimize() - tagger = tagger.star - self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]") + punct = add_weight(punctuation.tagger, 1.1) + token = closure(punct + delete(" ").ques) + classify + closure(delete(" ").ques + punct) + graph = token + closure(self.DELETE_EXTRA_SPACE + token) + self.tagger = delete(" ").star + graph + delete(" ").star verbalizer = ( cardinal.verbalizer | ordinal.verbalizer | decimal.verbalizer + | date.verbalizer + | time.verbalizer + | measure.verbalizer + | money.verbalizer + | telephone.verbalizer + | electronic.verbalizer + | whitelist.verbalizer + | word.verbalizer | char.verbalizer + | punctuation.verbalizer ).optimize() - self.verbalizer = verbalizer.star + self.verbalizer = (verbalizer + self.INSERT_SPACE).star @ self.build_rule( + self.DELETE_EXTRA_SPACE + ) @ self.build_rule(delete(" "), r="[EOS]") diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py index 4f59719..71fc4b0 100644 --- a/itn/english/rules/cardinal.py +++ b/itn/english/rules/cardinal.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file, union -from pynini.lib.pynutil import delete, insert +from pynini import closure, cross, difference, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert from tn.processor import Processor from tn.utils import get_abs_path @@ -35,7 +35,8 @@ def build_tagger(self): # 1~9, 10~19, 20~99 one_digit = digit - two_digit = teen | (ties + (ds + digit | insert("0"))) + two_digit = teen | (ties + (ds + digit | add_weight(insert("0"), 0.1))) + self.graph_two_digit = two_digit up_to_99 = one_digit | two_digit # one hundred, one hundred twenty three, one hundred one @@ -47,6 +48,7 @@ def build_tagger(self): # 1~999 up_to_999 = up_to_99 | hundreds + self.up_to_999 = up_to_999 # 1~999 with zero-padding to 3 digits up_to_999_padded = hundreds | insert("0") + two_digit | insert("00") + one_digit @@ -111,10 +113,17 @@ def _with_mag_padded(name): graph = (delete_and @ graph).optimize() self.graph = graph + self.graph_no_exception = graph + + # exclude 0-12 from cardinal tagger (they stay as words) + from itn.english.rules.time import _num_to_word + exception_labels = ["zero"] + [_num_to_word(x) for x in range(1, 13)] + exception = union(*exception_labels).optimize() + graph_with_exception = (difference(self.VSIGMA, exception) @ graph).optimize() minus = delete("minus") | delete("negative") optional_minus = closure(insert('negative: "-" ') + minus + ds, 0, 1) - final_graph = optional_minus + insert('integer: "') + graph + insert('"') + final_graph = optional_minus + insert('integer: "') + graph_with_exception + insert('"') self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py index 42dc46d..4804e6e 100644 --- a/itn/english/rules/date.py +++ b/itn/english/rules/date.py @@ -56,6 +56,8 @@ def build_tagger(self): # Year as two groups of two digits: "twenty twelve" => 2012 year_two_parts = (teen | two_digit) + ds + (two_digit | oh_digit | teen) + # 3-digit year: "seven fifty" => 750 + year_three_digit = digit + ds + (two_digit | oh_digit | teen) # Year as "X thousand Y": "two thousand twelve" => 2012 # Need zero-padded variants so "two thousand three" => 2003 @@ -116,9 +118,32 @@ def build_tagger(self): + po ) # Year only => "twenty twelve", "two thousand three" - graph_y = add_weight(year, 0.01) + po + graph_y = year + po - final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y + # Decades: "nineteen eighties" => 1980s + decade_suffix = closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s")) + decade_word = pynini.compose(decade_suffix, ties | cross("ten", "10")) + graph_decade = ( + insert('year: "') + (teen | two_digit) + ds + decade_word + insert('0s"') + po + ) + + # Quarter: "second quarter of twenty twenty two" => Q2 2022 + quarter_num = ( + cross("first", "1") | cross("second", "2") + | cross("third", "3") | cross("fourth", "4") + ) + graph_quarter = ( + insert('day: "Q') + quarter_num + insert('"') + + ds + delete("quarter") + ds + delete("of") + ds + + insert(' year: "') + year_graph + insert('"') + po + ) + + # BC/AD suffix + bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD")) + year_graph_with_3digit = year_graph | year_three_digit + graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po + + final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): @@ -160,6 +185,8 @@ def build_verbalizer(self): graph_dmy = day + self.DELETE_SPACE + insert(" ") + month + optional_year # year only graph_y = year + # day + year (for quarter: Q2 2022) + graph_dy = day + self.DELETE_SPACE + insert(" ") + year - graph = (graph_mdy | graph_dmy | graph_y) + self.DELETE_SPACE + delete_po + graph = (graph_mdy | graph_dmy | graph_dy | graph_y) + self.DELETE_SPACE + delete_po self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/decimal.py b/itn/english/rules/decimal.py index 5446f09..e89c797 100644 --- a/itn/english/rules/decimal.py +++ b/itn/english/rules/decimal.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file +from pynini import closure, cross, string_file, union from pynini.lib.pynutil import delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor -from tn.utils import get_abs_path +from tn.utils import get_abs_path, load_labels class Decimal(Processor): @@ -45,6 +45,25 @@ def build_tagger(self): point = delete("point") graph = optional_negative + closure(integer_part + ds, 0, 1) + point + ds + frac_part + + # quantity: "five point two million" => 5.2 million + quantities = load_labels(get_abs_path("../itn/english/data/numbers/thousands.tsv")) + quantity_all = union(*[x[0] for x in quantities]) + quantity_no_thousand = union(*[x[0] for x in quantities if x[0] != "thousand"]) + # decimal + quantity: five point two million, 164.58 thousand + quantity_graph = ( + optional_negative + integer_part + ds + point + ds + frac_part + + ds + insert(' quantity: "') + quantity_all + insert('"') + ) + # cardinal (up to 999) + quantity: four hundred million, five million + # exclude thousand to let cardinal handle "ten thousand" => 10000 + cardinal_small = self.cardinal.up_to_999 + cardinal_quantity = ( + optional_negative + insert('integer_part: "') + cardinal_small + insert('"') + + ds + insert(' quantity: "') + quantity_no_thousand + insert('"') + ) + graph |= quantity_graph | cardinal_quantity + self.tagger = self.add_tokens(graph) def build_verbalizer(self): @@ -56,6 +75,11 @@ def build_verbalizer(self): + delete('"') + self.NOT_QUOTE.plus + delete('"') ) optional_fractional = closure(fractional + self.DELETE_SPACE, 0, 1) - graph = optional_sign + optional_integer + optional_fractional + quantity = ( + insert(" ") + delete('quantity:') + self.DELETE_SPACE + + delete('"') + self.NOT_QUOTE.plus + delete('"') + ) + optional_quantity = closure(quantity + self.DELETE_SPACE, 0, 1) + graph = optional_sign + optional_integer + optional_fractional + optional_quantity self.numbers = graph self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/electronic.py b/itn/english/rules/electronic.py index 46ca328..3563844 100644 --- a/itn/english/rules/electronic.py +++ b/itn/english/rules/electronic.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, invert, string_file +from pynini import accep, closure, cross, difference, invert, string_file from pynini.lib.pynutil import add_weight, delete, insert from tn.processor import Processor @@ -28,90 +28,49 @@ def __init__(self): def build_tagger(self): ds = delete(" ") - - # Single characters: digits and letters digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) - alpha_or_digit = self.ALPHA | digit | zero - - # Symbols from TSV (symbol\tname): invert to get name -> symbol - symbols = invert( - string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv")) - ) + symbols = invert(string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv"))) - # A "token" is either a single char (letter/digit/symbol) or a - # multi-letter word kept verbatim (e.g. "gmail", "nvidia"). - # Multi-letter words have lower priority so spelled-out letters are preferred. - word = add_weight(closure(self.ALPHA, 2), 0.01) - token = alpha_or_digit | symbols | word + char = self.ALPHA | digit | zero + word = add_weight(closure(self.ALPHA, 2), 0.1) + token = char | symbols | word + first_token = char | difference(word, accep("dot")) + component = first_token + closure(ds + token) - # A component is one or more tokens separated by spaces - component = token + closure(ds + token) + dot = cross("dot", ".") + domain = component + (ds + dot + ds + component).plus username = insert('username: "') + component + insert('"') + domain_field = insert('domain: "') + domain + insert('"') - # Domain: component(s) separated by "dot" => "." - dot = cross("dot", ".") - domain_content = component + closure(ds + dot + ds + component) - domain = insert('domain: "') + domain_content + insert('"') - - # Email: username at domain - graph_email = ( - username - + ds - + delete("at") - + ds - + insert(" ") - + domain - ) - - # URL protocol: "h t t p colon slash slash" or "h t t p s colon slash slash" + # Email: X at Y dot Z (requires "at" keyword) + graph_email = username + ds + delete("at") + ds + insert(" ") + domain_field + + # URL: requires protocol or www prefix http = cross("h t t p", "http") https = cross("h t t p s", "https") - colon_slash_slash = cross(" colon slash slash ", "://") - protocol_start = (http | https) + colon_slash_slash - - # www prefix + protocol = (http | https) + cross(" colon slash slash ", "://") www = cross("w w w", "www") - # URL: [protocol] [www.] domain - url_content = ( - closure(protocol_start, 0, 1) - + closure(www + ds + dot + ds, 0, 1) - + domain_content - ) - graph_url = insert('protocol: "') + url_content + insert('"') + # protocol + [www.] + domain + url_with_protocol = protocol + closure(www + ds + dot + ds, 0, 1) + domain + # www. + domain (no protocol) + url_with_www = www + ds + dot + ds + domain + # domain only (must have dot): nvidia dot com + url_domain_only = domain + + graph_url = insert('protocol: "') + (url_with_protocol | url_with_www | url_domain_only) + insert('"') final_graph = graph_email | graph_url self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): - username = ( - delete("username:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - domain = ( - delete("domain:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - protocol = ( - delete("protocol:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') - ) - - # Email: username@domain + username = delete("username:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + domain = delete("domain:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + protocol = delete("protocol:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"') + graph_email = username + self.DELETE_SPACE + insert("@") + domain - # URL: just output the protocol content directly graph_url = protocol - graph = graph_email | graph_url - self.verbalizer = self.delete_tokens(graph) + self.verbalizer = self.delete_tokens(graph_email | graph_url) diff --git a/itn/english/rules/money.py b/itn/english/rules/money.py new file mode 100644 index 0000000..766ce53 --- /dev/null +++ b/itn/english/rules/money.py @@ -0,0 +1,145 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import accep, closure, compose, cross, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert + +from itn.english.rules.cardinal import Cardinal +from itn.english.rules.decimal import Decimal +from itn.english.rules.time import _num_to_word +from tn.processor import Processor +from tn.utils import get_abs_path, load_labels + + +class Money(Processor): + + def __init__(self, cardinal=None, decimal=None): + super().__init__(name="money", ordertype="itn") + self.cardinal = cardinal or Cardinal() + self.decimal = decimal or Decimal(cardinal=self.cardinal) + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + cardinal_graph = self.cardinal.graph + cardinal_small = self.cardinal.up_to_999 + ds = delete(" ") + + currency_labels = load_labels(get_abs_path("../itn/english/data/currency.tsv")) + singular_pairs = [(name, symbol) for symbol, name in currency_labels] + plural_pairs = [] + for name, symbol in singular_pairs: + if name.endswith("s"): + plural_pairs.append((name + "es", symbol)) + else: + plural_pairs.append((name + "s", symbol)) + currency_singular = union(*[cross(name, symbol) for name, symbol in singular_pairs]).optimize() + currency_plural = union(*[cross(name, symbol) for name, symbol in singular_pairs + plural_pairs]).optimize() + + cent = cross("cent", "") | cross("cents", "") + magnitudes = load_labels(get_abs_path("../itn/english/data/magnitudes.tsv")) + magnitude = union(*[name for symbol, name in magnitudes if name != "thousand"]) + + # "two dollars" + # add "one fifty five" => "one hundred fifty five" => 155 + with_hundred = compose( + closure(self.NOT_SPACE) + accep(" ") + insert("hundred ") + self.VSIGMA, + compose(cardinal_graph, self.DIGIT ** 3), + ) + cardinal_with_hundred = cardinal_graph | with_hundred + not_one = self.DIGIT ** (2, ...) | (self.DIGIT - accep("1")) + cardinal_plural = compose(cardinal_with_hundred, not_one) + # "one dollar" (singular) vs "two dollars" (plural) + one = cross("one", "1") + integer_graph = ( + insert('value: "') + cardinal_plural + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + integer_graph |= ( + insert('value: "') + one + insert('"') + + ds + insert(' currency: "') + currency_singular + insert('"') + ) + # "fifty million dollars" / "four hundred billion won" + quantity_graph = ( + insert('value: "') + cardinal_small + insert('"') + + ds + insert(' quantity: "') + magnitude + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + # "two point five billion dollars" + digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) + zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) + frac_d = digit | zero | cross("o", "0") + frac = closure(frac_d + ds) + frac_d + decimal_quantity_graph = ( + insert('value: "') + cardinal_graph + insert(".") + + ds + delete("point") + ds + frac + insert('"') + + ds + insert(' quantity: "') + magnitude + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + # "twenty point five o six dollars" (decimal without quantity) + decimal_graph = ( + insert('value: "') + cardinal_graph + insert(".") + + ds + delete("point") + ds + frac + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + # "point five o six dollars" + decimal_no_int = ( + insert('value: ".') + delete("point") + ds + frac + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + # "one fifty five dollars" => $155 (missing "hundred") + with_hundred = ( + insert('value: "') + cardinal_small + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + ) + + # cents + cents_graph = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 100) if _num_to_word(x)]) + with_cents = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + + ds + (delete("and") + ds).ques + + insert(' decimal: "') + cents_graph + insert('"') + + ds + cent + ) + # "seventy five dollars sixty three" (no "cents" word) + dollars_amount = ( + insert('value: "') + cardinal_graph + insert('"') + + ds + insert(' currency: "') + currency_plural + insert('"') + + ds + insert(' decimal: "') + cents_graph + insert('"') + ) + cents_only = ( + insert('currency: "$" decimal: "') + cents_graph + insert('"') + + ds + cent + ) + + graph = ( + integer_graph | add_weight(quantity_graph, -1) | add_weight(decimal_quantity_graph, -1) + | decimal_graph | decimal_no_int + | with_cents | dollars_amount | cents_only + ) + self.tagger = self.add_tokens(graph) + + def build_verbalizer(self): + currency = delete('currency: "') + self.NOT_QUOTE.plus + delete('"') + value = delete(' value: "') + self.NOT_QUOTE.plus + delete('"') + decimal = delete(' decimal: "') + self.NOT_QUOTE.plus + delete('"') + quantity = delete(' quantity: "') + self.NOT_QUOTE.plus + delete('"') + + graph = currency + value + graph += closure(insert(".") + self.DELETE_SPACE + decimal, 0, 1) + graph += closure(insert(" ") + self.DELETE_SPACE + quantity, 0, 1) + graph |= currency + insert("0.") + self.DELETE_SPACE + decimal + + self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/punctuation.py b/itn/english/rules/punctuation.py new file mode 100644 index 0000000..36d169d --- /dev/null +++ b/itn/english/rules/punctuation.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import union +from pynini.lib.pynutil import insert + +from tn.processor import Processor + + +class Punctuation(Processor): + + def __init__(self): + super().__init__(name="punctuation", ordertype="itn") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + punct = union(*"!#$%&'()*+,-./:;<=>?@^_`{|}~") + tagger = insert('value: "') + punct + insert('"') + self.tagger = self.add_tokens(tagger) diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 0ce5f46..9576c26 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file -from pynini.lib.pynutil import delete, insert +from pynini import closure, cross, difference, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor @@ -30,42 +30,87 @@ def __init__(self, cardinal=None): def build_tagger(self): ds = delete(" ") - - # Single digit: spoken word -> digit character digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) - single_digit = digit | zero | cross("o", "0") | cross("oh", "0") - - # 10 digits formatted as XXX-XXX-XXXX - ten_digits = ( - single_digit + ds + single_digit + ds + single_digit - + insert("-") - + ds + single_digit + ds + single_digit + ds + single_digit - + insert("-") - + ds + single_digit + ds + single_digit + ds + single_digit + ds + single_digit - ) + single = digit | zero | cross("o", "0") | cross("oh", "0") + + # "double X" => XX + double = union(*[cross(f"double {w}", f"{d}{d}") + for w, d in [("one","1"),("two","2"),("three","3"),("four","4"), + ("five","5"),("six","6"),("seven","7"),("eight","8"), + ("nine","9"),("zero","0"),("oh","0"),("o","0")]]) + + # two-digit cardinal: twenty three => 23 (uses graph_two_digit for proper space handling) + two_digit = self.cardinal.graph_two_digit - # Optional country code: "plus X" or just digits before the main number - country_code_digits = ( - closure(single_digit + ds, 0, 2) + single_digit + # a token is 1 or 2 digits + token = single | double | add_weight(two_digit, 0.002) + + # sequence of tokens separated by spaces + seq = token + closure(ds + token) + + # phone: XXX-XXX-XXXX + phone = seq @ ( + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 4 ) + + # country code country_code = ( - closure(cross("plus ", "+"), 0, 1) + country_code_digits + insert('country_code: "') + + closure(cross("plus ", "+"), 0, 1) + + (closure(single + ds, 0, 2) + single | add_weight(two_digit, 0.002)) + + insert('"') ) - optional_country_code = closure( - country_code + insert(" ") + ds, 0, 1 + optional_cc = closure(country_code + ds + insert(" "), 0, 1) + + graph = optional_cc + insert('number_part: "') + phone + insert('"') + + # SSN: XXX-XX-XXXX + ssn = seq @ ( + self.DIGIT ** 3 + insert("-") + self.DIGIT ** 2 + insert("-") + self.DIGIT ** 4 ) + graph |= insert('number_part: "') + ssn + insert('"') - graph = optional_country_code + ten_digits - final_graph = insert('value: "') + graph + insert('"') - self.tagger = self.add_tokens(final_graph) + # IP: X.X.X.X + ip_token = ( + single + closure(ds + single, 0, 2) + | double + | add_weight(two_digit, 0.002) + | single + ds + two_digit + | two_digit + ds + single + ) + ip = ip_token + (cross(" dot ", ".") + ip_token) ** 3 + graph |= insert('number_part: "') + add_weight(ip, -0.001) + insert('"') - def build_verbalizer(self): - value = ( - delete("value:") - + self.DELETE_SPACE - + delete('"') - + self.NOT_QUOTE.plus - + delete('"') + # credit card: 4-4-4-4 (16), 4-6-4 (14), 4-6-5 (15) + space = insert(" ") + D = self.DIGIT + cc_format = ( + D ** 4 + space + D ** 4 + space + D ** 4 + space + D ** 4 + | D ** 4 + space + D ** 6 + space + D ** 4 + | D ** 4 + space + D ** 6 + space + D ** 5 ) - self.verbalizer = self.delete_tokens(value) + cc = seq @ cc_format + graph |= optional_cc + insert('number_part: "') + cc + insert('"') + + # serial: mixed alpha+digits, at least one digit, length >= 3 + # Exclude "a" as first char to avoid "a thirty six" -> "a36" + not_a = difference(self.ALPHA, union("a", "A")) + serial_digit = single | add_weight(two_digit, -0.002) + serial_char = serial_digit | self.ALPHA + seq1 = (not_a | serial_digit) + closure(ds + serial_char, 2) + seq1 |= serial_char + closure(ds + (single | self.ALPHA), 2) + seq2 = self.ALPHA + closure(ds + self.ALPHA, 1) + closure(ds + two_digit, 1) + seq2 |= not_a + closure(ds + two_digit, 1) + seq2 |= two_digit + closure(ds + two_digit, 1) + closure(ds + self.ALPHA, 1) + serial = (seq1 | seq2) @ (closure(self.ALPHA | D) + D + closure(self.ALPHA | D)) + graph |= insert('number_part: "') + add_weight(serial, 2.0) + insert('"') + + self.tagger = self.add_tokens(graph) + + def build_verbalizer(self): + cc = delete('country_code: "') + self.NOT_QUOTE.plus + delete('"') + num = delete(' number_part: "') + self.NOT_QUOTE.plus + delete('"') + num_only = delete('number_part: "') + self.NOT_QUOTE.plus + delete('"') + graph = cc + self.DELETE_SPACE + insert(" ") + num | num_only + self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py index dbdea08..f2e5d75 100644 --- a/itn/english/rules/time.py +++ b/itn/english/rules/time.py @@ -12,14 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini import closure, cross, string_file -from pynini.lib.pynutil import delete, insert +from pynini import closure, cross, invert, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from tn.processor import Processor from tn.utils import get_abs_path +def _num_to_word(n): + ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", + "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", + "seventeen", "eighteen", "nineteen"] + tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] + if n < 20: + return ones[n] + return tens[n // 10] + (" " + ones[n % 10] if n % 10 else "") + + class Time(Processor): def __init__(self, cardinal=None): @@ -29,34 +39,81 @@ def __init__(self, cardinal=None): self.build_verbalizer() def build_tagger(self): - digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) - teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv")) - ties = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv")) + cardinal_graph = add_weight(self.cardinal.graph_no_exception, -0.7) time_suffix = string_file(get_abs_path("../itn/english/data/time/time_suffix.tsv")) - time_zone = string_file(get_abs_path("../itn/english/data/time/time_zone.tsv")) + time_zone = invert(string_file(get_abs_path("../itn/english/data/time/time_zone.tsv"))) + to_hour = string_file(get_abs_path("../itn/english/data/time/to_hour.tsv")) + minute_to = string_file(get_abs_path("../itn/english/data/time/minute_to.tsv")) ds = delete(" ") - hour = teen | (insert("0") + digit) - minute = teen | (ties + (ds + digit | insert("0"))) | insert("0") + digit + hour_all = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(0, 24) if _num_to_word(x)]) + hour_12 = union(*[cross(_num_to_word(x), f"{x:02d}") for x in range(1, 13)]) + graph_min_single = union(*[cross(_num_to_word(x), f"0{x}") for x in range(1, 10)]) + graph_min_double = union(*[cross(_num_to_word(x), str(x)) for x in range(10, 60)]) + graph_min_verbose = cross("half", "30") | cross("quarter", "15") - # two thirty => 02:30 - graph = insert('hour: "') + hour + insert('" ') + ds + insert('minute: "') + minute + insert('"') - # eight oclock => 08:00 - oclock = cross("o'clock", "") | cross("oclock", "") - graph |= insert('hour: "') + hour + insert('" minute: "00"') + ds + oclock + # minutes without zero-padding (for minute_to composition) + min_single_raw = union(*[cross(_num_to_word(x), str(x)) for x in range(1, 10)]) + min_double_raw = graph_min_double # already no padding + oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "") + + hour = insert('hour: "') + hour_all + insert('"') + hour12 = insert('hour: "') + hour_12 + insert('"') suffix = ds + insert(' noon: "') + time_suffix + insert('"') zone = ds + insert(' zone: "') + time_zone + insert('"') - graph += suffix.ques + zone.ques + zone_opt = closure(zone, 0, 1) + + # "eight oclock" / "eight oclock gmt" + graph_oclock = hour + ds + insert(' minute: "') + oclock + insert('00"') + zone_opt + # "two o five" + graph_o_min = hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + # "two pm" / "three am est" + graph_h_suffix = hour + insert(' minute: "00"') + suffix + zone_opt + # "two thirty am" + graph_hm_suffix = ( + hour + ds + insert(' minute: "') + graph_min_double + insert('"') + suffix + zone_opt + ) + # "two thirty" (1-12 only, no suffix) + graph_hm = hour12 + ds + insert(' minute: "') + graph_min_double + insert('"') + # "eleven o six pm" + graph_o_min_suffix = ( + hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + suffix + zone_opt + ) + # "half past two", "quarter past two" + graph_past = ( + insert('minute: "') + graph_min_verbose + insert('"') + ds + delete("past") + ds + hour + ) + # "quarter to one" => 12:45 + graph_quarter_to = ( + insert('minute: "') + cross("quarter", "45") + insert('"') + + ds + delete("to") + ds + + insert('hour: "') + to_hour + insert('"') + ) + # "ten to eleven pm" => 10:50 p.m. + graph_min_to = ( + insert('minute: "') + + ((min_single_raw | min_double_raw) @ minute_to) + + insert('"') + + closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1) + + ds + delete("to") + ds + + insert('hour: "') + to_hour + insert('"') + + suffix + ) - self.tagger = self.add_tokens(graph) + final_graph = ( + graph_oclock | graph_o_min | graph_h_suffix + | graph_hm_suffix | graph_hm | graph_o_min_suffix + | graph_past | graph_quarter_to | graph_min_to + ) + self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): - hours = delete('hour: "') + self.NOT_QUOTE.plus + delete('"') - minutes = delete(' minute: "') + self.NOT_QUOTE.plus + delete('"') - suffix = delete(' noon: "') + self.NOT_QUOTE.plus + delete('"') + hour = delete('hour: "') + self.NOT_QUOTE.plus + delete('"') + minute = delete(' minute: "') + self.NOT_QUOTE.plus + delete('"') + noon = delete(' noon: "') + self.NOT_QUOTE.plus + delete('"') zone = delete(' zone: "') + self.NOT_QUOTE.plus + delete('"') - graph = hours + insert(":") + self.DELETE_SPACE + minutes - graph += closure(insert(" ") + self.DELETE_SPACE + suffix, 0, 1) + graph = hour + insert(":") + self.DELETE_SPACE + minute + graph += closure(insert(" ") + self.DELETE_SPACE + noon, 0, 1) graph += closure(insert(" ") + self.DELETE_SPACE + zone, 0, 1) self.verbalizer = self.delete_tokens(graph) diff --git a/itn/english/rules/word.py b/itn/english/rules/word.py new file mode 100644 index 0000000..46c82bc --- /dev/null +++ b/itn/english/rules/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import difference, union +from pynini.lib.pynutil import insert + +from tn.processor import Processor + + +class Word(Processor): + + def __init__(self): + super().__init__(name="word", ordertype="itn") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + valid_char = difference(self.NOT_SPACE, union('"', "\\")) + tagger = insert('value: "') + valid_char.plus + insert('"') + self.tagger = self.add_tokens(tagger) diff --git a/itn/english/test/data/en_cardinal.txt b/itn/english/test/data/en_cardinal.txt index 1fd0c97..ce8de7e 100644 --- a/itn/english/test/data/en_cardinal.txt +++ b/itn/english/test/data/en_cardinal.txt @@ -1,22 +1,28 @@ -twenty three => 23 -one hundred => 100 -one hundred and one => 101 +nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 two hundred and fifty four => 254 -one thousand => 1000 -one thousand two hundred thirty four => 1234 +one hundred forty seven thousand four hundred fifty one => 147451 +one million one hundred fifty six thousand one hundred seventy three => 1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +minus twenty five thousand thirty seven => -25037 +one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +minus sixty => -60 +forty six thousand six hundred sixty four => 46664 +sixty => 60 +zero => zero +two million three => 2000003 +one thousand thirteen => 1013 +one thousand one => 1001 +one thousand one hundred => 1100 +one thousand twenty six => 1026 +one thousand one hundred twenty six => 1126 +eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +eighteen thousand eight hundred eighty => 18880 eleven hundred => 1100 twenty one hundred => 2100 twenty one hundred and eleven => 2111 -ten thousand => 10000 -one hundred thousand => 100000 -one million => 1000000 -one billion => 1000000000 -one trillion => 1000000000000 -one thousand and one => 1001 -one million one => 1000001 -one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 -zero => 0 -five => 5 -thirty => 30 -minus forty two => -42 -negative five => -5 +eleven hundred twenty one => 1121 diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt index b0aacec..de5be29 100644 --- a/itn/english/test/data/en_date.txt +++ b/itn/english/test/data/en_date.txt @@ -1,17 +1,36 @@ july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s +two thousand and twenty => 2020 +two thousand and nine => 2009 the twenty fifth of july twenty twelve => 25 july 2012 the twenty fifth of july two thousand twelve => 25 july 2012 the twenty second of july twenty twelve => 22 july 2012 the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 january first => january 1 july twenty second two thousand eight => july 22 2008 june thirty => june 30 july twenty fifth twenty twelve => july 25 2012 +nineteen seventeen => 1917 twenty twelve => 2012 +march sixteen sixty five => march 1665 +sixteen sixty five => 1665 july two thousand twelve => july 2012 october nineteen oh five => october 1905 +july fifteen o six => july 1506 +the twenty fifth of july twenty twelve => 25 july 2012 +july twenty fifth twenty twelve => july 25 2012 +july twenty fifth two thousand twelve => july 25 2012 +july one thousand eight hundred seventy six => july 1876 february twenty fifth twenty sixteen => february 25 2016 november twenty fourth twenty fourteen => november 24 2014 -two thousand and three => 2003 -two thousand and twenty => 2020 +nineteen ninety four => 1994 +two thousand three => 2003 +one thousand eight => 1008 nineteen seventy six => 1976 +june twentieth twenty fourteen => june 20 2014 +nineteen seventy three => 1973 +nineteen seventy five => 1975 +eleven fifty five => 1155 +second quarter of twenty twenty two => Q2 2022 +seven fifty b c => 750BC diff --git a/itn/english/test/data/en_decimal.txt b/itn/english/test/data/en_decimal.txt index e787cca..405242b 100644 --- a/itn/english/test/data/en_decimal.txt +++ b/itn/english/test/data/en_decimal.txt @@ -1,8 +1,63 @@ -twelve point five => 12.5 -three point one four => 3.14 -minus three point one four => -3.14 -point o five => .05 -point five => .5 -one point zero => 1.0 -zero point five => 0.5 -twenty three point four five six => 23.456 +five point two million => 5.2 million +one hundred sixty four point five eight thousand => 164.58 thousand +four hundred million => 400 million +fifty billion => 50 billion +four hundred five billion => 405 billion +four point eight five billion => 4.85 billion +one hundred billion => 100 billion +one hundred ten billion => 110 billion +one hundred thirty two billion => 132 billion +one point eight four billion => 1.84 billion +one point eight one billion => 1.81 billion +one point five nine billion => 1.59 billion +one point four five three billion => 1.453 billion +one point seven two billion => 1.72 billion +one point two five billion => 1.25 billion +thirteen billion => 13 billion +thirty billion => 30 billion +two thousand eight hundred five point eight seven three billion => 2805.873 billion +seventy trillion => 70 trillion +thirteen million => 13 million +eighteen billion => 18 billion +four hundred fifty million => 450 million +one hundred thirty million => 130 million +ten million => 10 million +four hundred million => 400 million +five million => 5 million +five hundred million => 500 million +twelve million => 12 million +thirteen million => 13 million +four million => 4 million +forty five million => 45 million +fifteen million => 15 million +fifteen trillion => 15 trillion +fifteen billion => 15 billion +two million => 2 million +eight million => 8 million +point one two o five => .1205 +minus sixty point two four zero zero => -60.2400 +zero point two six => 0.26 +point zero two => .02 +sixty point two => 60.2 +eighteen => 18 +eighteen point eight five => 18.85 +eighteen point five o => 18.50 +eighteen point five six => 18.56 +eighteen point nine => 18.9 +eighteen point o five => 18.05 +eighteen point one two => 18.12 +eighteen point o one => 18.01 +eighteen point o o o => 18.000 +eighteen point six => 18.6 +eighteen point three o o => 18.300 +eighteen point three six => 18.36 +eighteen point two five => 18.25 +eighteen point two two => 18.22 +eight hundred eighteen point three o three => 818.303 +eight hundred eight point eight => 808.8 +eight hundred eight point zero => 808.0 +eight hundred eighty eight point one => 888.1 +eight hundred eighty four point three => 884.3 +eight hundred eighty two point eight => 882.8 +eight hundred eighty two point zero => 882.0 +eight hundred forty five point nine four => 845.94 diff --git a/itn/english/test/data/en_electronic.txt b/itn/english/test/data/en_electronic.txt index a296c65..aba14ec 100644 --- a/itn/english/test/data/en_electronic.txt +++ b/itn/english/test/data/en_electronic.txt @@ -1,5 +1,25 @@ +a dot b c at g mail dot com => a.bc@gmail.com a at gmail dot com => a@gmail.com +a at m s n dot fr => a@msn.fr +a at a o l dot com => a@aol.com +a at m s n dot com => a@msn.com +a at nvidia dot com => a@nvidia.com +a dot b c at nvidia dot com => a.bc@nvidia.com c d f at a b c dot e d u => cdf@abc.edu -a b c at a b c dot com => abc@abc.com a b c at g mail dot a b c => abc@gmail.abc -a dot b c at nvidia dot com => a.bc@nvidia.com +a b c at a b c dot com => abc@abc.com +a s d f one two three at a b c dot com => asdf123@abc.com +a one b two at a b c dot com => a1b2@abc.com +a b three dot s d d dot three at g mail dot com => ab3.sdd.3@gmail.com +one three at g mail dot com => 13@gmail.com +a b three hyphen s d d dash three at g mail dot com => ab3-sdd-3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +w w w dot c o m d a i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +c o m d a i l y n e w s dot a b slash s m => comdailynews.ab/sm +n vidia dot com => nvidia.com +abc at gmail dot com => abc@gmail.com +athreed at gmail dot com => athreed@gmail.com +kore dot ai => kore.ai +dot three at g mail dot com => dot 3@gmail.com diff --git a/itn/english/test/data/en_measure.txt b/itn/english/test/data/en_measure.txt index 9b3aa65..612f31d 100644 --- a/itn/english/test/data/en_measure.txt +++ b/itn/english/test/data/en_measure.txt @@ -1,14 +1,112 @@ two hundred meters => 200 m +fifty six point three per square kilometer => 56.3 /km² two hundred kilometers per hour => 200 km/h +twenty eight kilograms force per square centimeter => 28 kgf/cm² +forty two thousand two hundred fifty nine per square meter => 42259 /m² +minus two thousand twelve kilo liters => -2012 kl minus sixty six kilograms => -66 kg +two kilo watt hours => 2 kWh +one point o o o o two eight cubic deci meters => 1.000028 dm³ +seven point five peta bytes => 7.5 pb three hours => 3 h one milli volt => 1 mv +two cubic meters => 2 m³ ninety grams => 90 g -eight kilograms => 8 kg +one hundred twenty four point three lumens => 124.3 lm +four hundred forty milliliters => 440 ml +thirty one thousand four hundred eighty square feet => 31480 sq ft +one thousand six hundred hours => 1600 h +thirty one thousand four hundred eighty square feet => 31480 sq ft +two square miles => 2 sq mi +zero point one nine square miles => 0.19 sq mi +one thousand five hundred thirty one c c => 1531 cc +three hundred micrometers => 300 μm +sixty five thousand square kilometers => 65000 km² +two miles per hour => 2 mph +two hundred forty five miles per hour => 245 mph +one hundred fifty c c => 150 cc +sixty point two four zero zero kilograms => 60.2400 kg +zero feet => 0 ft +zero foot => 0 ft +two feet => 2 ft +twenty foot => 20 ft +point two meters => .2 m +two square meters => 2 m² eighteen feet => 18 ft +eighteen mega siemens => 18 ms eighteen ounces => 18 oz -eight hundred kilowatts => 800 kW +eighteen point five kilometers => 18.5 km +eighteen point five two square kilometers => 18.52 km² +eighteen point nine one square kilometers => 18.91 km² +eighteen point one four percent => 18.14 % +eighteen point one six percent => 18.16 % +eighteen point one square kilometers => 18.1 km² +eighteen point six percent => 18.6 % +eighteen point two two kilometers => 18.22 km +eighteen point zero kilometers => 18.0 km +eighteen point zero percent => 18.0 % +eighteen square kilometers => 18 km² +eighteen thousand eight hundred giga watt hours => 18800 gWh +eighteen thousand seven hundred hectares => 18700 ha +eight hectares => 8 ha +eight hundred eighty five astronomical units => 885 au +eight hundred eighty hectares => 880 ha +eight hundred eighty kilobytes => 880 kb +eight hundred eighty kilometers => 880 km +eight hundred eighty nine feet => 889 ft +eight hundred eighty six kilometers => 886 km +eight hundred eighty two megawatts => 882 mW +eight hundred feet => 800 ft +eight hundred fifty five square kilometers => 855 km² +eight hundred fifty megahertz => 850 mhz +eight hundred fifty meters => 850 m +eight hundred fifty nanometers => 850 nm +eight hundred fifty one meters => 851 m +eight hundred fifty seven square kilometers => 857 km² +eight hundred fifty three meters => 853 m +eight hundred fifty three point six meters => 853.6 m +eight hundred five point four six square kilometers => 805.46 km² +eight hundred forty two point nine meters => 842.9 m +eight hundred forty two square kilometers => 842 km² +eight hundred gigabytes => 800 gb eight hundred horsepower => 800 hp -fifty six point three per square kilometer => 56.3 /km² -twelve point five meters => 12.5 m -point two meters => .2 m +eight hundred kilograms => 800 kg +eight hundred kilo watt hours => 800 kWh +eight hundred kilowatts => 800 kW +eight hundred megahertz => 800 mhz +eight hundred ninety four c c => 894 cc +eight hundred ninety kilowatts => 890 kW +eight hundred ninety millimeters => 890 mm +eight hundred ninety two square kilometers => 892 km² +eight hundred seventy horsepower => 870 hp +eight hundred seventy meters => 870 m +eight hundred sixty kilograms => 860 kg +eight hundred sixty kilometers => 860 km +eight hundred sixty miles => 860 mi +eight hundred sixty six feet => 866 ft +eight hundred ten hectares => 810 ha +eight hundred ten kilohertz => 810 khz +eight hundred thirty eight point two millimeters => 838.2 mm +eight hundred thirty five kilometers => 835 km +eight hundred thirty kilohertz => 830 khz +eight hundred thirty megawatts => 830 mW +eight hundred thirty nine kilometers => 839 km +eight hundred thirty six meters => 836 m +eight hundred twenty feet => 820 ft +eight hundred twenty kilometers => 820 km +eight hundred twenty meters => 820 m +eight hundred twenty one point zero feet => 821.0 ft +eight hundred two point eight nine kilometers => 802.89 km +eight hundred volts => 800 v +eight kilobits => 8 kb +eight kilograms => 8 kg +eight million two hundred thousand feet => 8200000 ft +eight point eight kilometers => 8.8 km +eight point eight meters => 8.8 m +eight point eight miles => 8.8 mi +eight point five centimeters => 8.5 cm +eight point five five percent => 8.55 % +eight point five megawatts => 8.5 mW +eight point five meters => 8.5 m +eight point five two percent => 8.52 % +eight point four four percent => 8.44 % diff --git a/itn/english/test/data/en_money.txt b/itn/english/test/data/en_money.txt new file mode 100644 index 0000000..b1e5806 --- /dev/null +++ b/itn/english/test/data/en_money.txt @@ -0,0 +1,52 @@ +two dollars => $2 +one cent => $0.01 +four united states dollars and sixty nine cents => $4.69 +seventy five dollars sixty three => $75.63 +twenty nine dollars fifty cents => $29.50 +eleven dollars and fifty one cents => $11.51 +nine hundred ninety three dollars and ninety two cents => $993.92 +four hundred sixty billion won => ₩460 billion +thirty billion yen => ¥30 billion +two point five billion dollars => $2.5 billion +forty five billion dollars => $45 billion +fifty million dollars => $50 million +fifty billion dollars => $50 billion +zero point two million dollars => $0.2 million +fifteen point two billion dollars => $15.2 billion +one point six nine billion yuan => 1.69 billion yuan +one point four three six billion yuan => 1.436 billion yuan +four million yuan => 4 million yuan +one dollar => $1 +fifteen thousand dollars => $15000 +twenty dollar => $20 +twenty point five o six dollars => $20.506 +point five o six dollars => $.506 +eighteen dollars => $18 +eighteen million nine hundred twenty five thousand dollars => $18925000 +eighteen thousand eight hundred fifty four dollars => $18854 +eighteen thousand eight hundred one dollars => $18801 +eighteen thousand eight hundred seventy five dollars => $18875 +eighteen thousand eighty one dollars => $18081 +eighteen thousand fifty two dollars => $18052 +eighteen thousand five hundred forty two dollars => $18542 +eighteen thousand five hundred nineteen dollars => $18519 +eighteen thousand five hundred seventy dollars => $18570 +eighteen thousand five hundred seventy eight dollars => $18578 +eighteen thousand five hundred sixteen dollars => $18516 +eighteen thousand four hundred eighty two dollars => $18482 +eighteen thousand four hundred seventy eight dollars => $18478 +eighteen thousand four hundred sixty eight dollars => $18468 +eighteen thousand nine hundred three dollars => $18903 +eighteen thousand nine hundred twenty nine dollars => $18929 +eighteen thousand ninety five dollars => $18095 +eighteen thousand one hundred seventeen dollars => $18117 +eighteen thousand one hundred twenty eight dollars => $18128 +eighteen thousand one hundred twenty five dollars => $18125 +eighteen thousand one hundred twenty four dollars => $18124 +eighteen thousand one hundred twenty nine dollars => $18129 +one thousand fifty five dollars => $1055 +one fifty five dollars => $155 +fifteen hundred dollars => $1500 +ninety nine hundred dollars => $9900 +ninety nine hundred and fifteen dollars and one cent => $9915.01 +one dollars => one dollars diff --git a/itn/english/test/data/en_ordinal.txt b/itn/english/test/data/en_ordinal.txt index 16e43e6..37592bb 100644 --- a/itn/english/test/data/en_ordinal.txt +++ b/itn/english/test/data/en_ordinal.txt @@ -1,13 +1,34 @@ +one hundredth => 100th +twenty five thousand one hundred eleventh => 25111th +second => 2nd +zeroth => 0th first => 1st second => 2nd third => 3rd fourth => 4th -fifth => 5th eleventh => 11th twelfth => 12th thirteenth => 13th twenty first => 21st -thirty second => 32nd -forty third => 43rd -one hundredth => 100th -one hundred and first => 101st +twenty third => 23rd +one hundred eleventh => 111th +one thousandth => 1000th +one hundred twenty first => 121st +eleven hundred twenty first => 1121st +second => 2nd +tenth => 10th +sixth => 6th +third => 3rd +nineteenth => 19th +third => 3rd +twelfth => 12th +forty eighth => 48th +seventy first => 71st +third => 3rd +forty second => 42nd +seventeenth => 17th +twentieth => 20th +twenty first => 21st +seventh => 7th +second => 2nd +fifth => 5th diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index f1ef0e2..d0130b4 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -1,3 +1,23 @@ one two three one two three five six seven eight => 123-123-5678 -one two three four five six seven eight nine zero => 123-456-7890 -plus one one two three one two three five six seven eight => +1 123-123-5678 +plus nine one one two three one two three five six seven eight => +91 123-123-5678 +plus forty four one two three one two three five six seven eight => +44 123-123-5678 +four one two three one two three five six seven eight => 4 123-123-5678 +zero two three one two three five six seven eight => 023-123-5678 +o two three one two three five six seven eight => 023-123-5678 +oh two three one two three five six seven eight => 023-123-5678 +double oh three one two three five six seven eight => 003-123-5678 +one two three dot one two three dot o dot four o => 123.123.0.40 +one twenty three dot one two three dot o dot four o => 123.123.0.40 +two two five dot double five dot o dot four o => 225.55.0.40 +two two five dot double five dot o dot forty five => 225.55.0.45 +ssn is seven double nine one two three double one three => ssn is 799-12-3113 +seven nine nine => 799 +a b nine => ab9 +a b c => a b c +five w k r a three one => 5wkra31 +x eighty six => x86 +x three eighty six => x386 +r t x forty fifty t i => RTX 4050ti +four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005 +a thirty six => a 36 +a ten eighty p display => a 1080p display diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt new file mode 100644 index 0000000..3a04982 --- /dev/null +++ b/itn/english/test/data/en_time.txt @@ -0,0 +1,29 @@ +eight oclock g m t => 08:00 gmt +seven a m e s t => 07:00 a.m. est +two p m => 02:00 p.m. +two thirty => 02:30 +three o'clock => 03:00 +quarter past one => 01:15 +half past three => 03:30 +eight fifty one => 08:51 +eight fifty two => 08:52 +eight forty => 08:40 +eight nineteen => 08:19 +eight o six => 08:06 +eight thirty eight => 08:38 +eight thirty two => 08:32 +eight twenty nine => 08:29 +eleven fifty five p m => 11:55 p.m. +eleven fifty three p m => 11:53 p.m. +eleven forty a m => 11:40 a.m. +eleven forty five a m => 11:45 a.m. +eleven forty p m => 11:40 p.m. +eleven forty six a m => 11:46 a.m. +eleven o six p m => 11:06 p.m. +eleven thirteen a m => 11:13 a.m. +half past twelve => 12:30 +quarter past one => 01:15 +quarter to one => 12:45 +quarter to twelve => 11:45 +set alarm at ten to eleven pm => set alarm at 10:50 p.m. +one min to one am => 12:59 a.m. diff --git a/itn/english/test/data/en_whitelist.txt b/itn/english/test/data/en_whitelist.txt new file mode 100644 index 0000000..96abeb6 --- /dev/null +++ b/itn/english/test/data/en_whitelist.txt @@ -0,0 +1,12 @@ +doctor dao => dr. dao +misses smith => mrs. smith +mister dao => mr. dao +saint george => st. george +i like for example ice cream => i like e.g. ice cream +s and p five hundred => S&P 500 +seven eleven stores => 7-eleven stores +r t x => RTX +cat five e => CAT5e +c u d n n => cuDNN +p c i e x eight => PCIe x8 +l g a eleven fifty => LGA 1150 diff --git a/itn/english/test/data/en_word.txt b/itn/english/test/data/en_word.txt new file mode 100644 index 0000000..b77a69e --- /dev/null +++ b/itn/english/test/data/en_word.txt @@ -0,0 +1,55 @@ + => +, one => , one +, one , two , three , four => , one , two , three , four +e s three => es3 +yahoo! => yahoo! +twenty! => 20 ! +x => x +— => — +aaa => aaa +aabach => aabach +aabenraa => aabenraa +aabye => aabye +aaccessed => aaccessed +aach => aach +aachen's => aachen's +aadri => aadri +aafia => aafia +aagaard => aagaard +aagadu => aagadu +aagard => aagard +aagathadi => aagathadi +aaghart's => aaghart's +aagnes => aagnes +aagomoni => aagomoni +aagon => aagon +aagoo => aagoo +aagot => aagot +aahar => aahar +aahh => aahh +aahperd => aahperd +aaibinterstate => aaibinterstate +aajab => aajab +aakasa => aakasa +aakervik => aakervik +aakirkeby => aakirkeby +aalam => aalam +aalbaek => aalbaek +aaldiu => aaldiu +aalem => aalem +a'ali => a'ali +aalilaassamthey => aalilaassamthey +aalin => aalin +aaliyan => aaliyan +aaliyan's => aaliyan's +aamadu => aamadu +aamara => aamara +aambala => aambala +aamera => aamera +aamer's => aamer's +aamina => aamina +aaminah => aaminah +aamjiwnaang => aamjiwnaang + => +, one => , one +, one , two , three , four => , one , two , three , four diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py index 507ead3..676ddb4 100644 --- a/itn/english/test/normalizer_test.py +++ b/itn/english/test/normalizer_test.py @@ -28,6 +28,14 @@ class TestNormalizer: parse_test_case("data/en_cardinal.txt"), parse_test_case("data/en_ordinal.txt"), parse_test_case("data/en_decimal.txt"), + parse_test_case("data/en_date.txt"), + parse_test_case("data/en_time.txt"), + parse_test_case("data/en_money.txt"), + parse_test_case("data/en_measure.txt"), + parse_test_case("data/en_telephone.txt"), + parse_test_case("data/en_electronic.txt"), + parse_test_case("data/en_whitelist.txt"), + parse_test_case("data/en_word.txt"), ) @pytest.mark.parametrize("spoken, written", normalizer_cases) diff --git a/tn/token_parser.py b/tn/token_parser.py index a56f9b0..88ab582 100644 --- a/tn/token_parser.py +++ b/tn/token_parser.py @@ -30,9 +30,9 @@ "date": ["year", "month", "day", "preserve_order"], "fraction": ["sign", "numerator", "denominator"], "measure": ["numerator", "denominator", "value", "units"], - "money": ["currency", "value", "decimal"], - "time": ["hour", "minute", "second", "noon"], - "telephone": ["value"], + "money": ["currency", "value", "decimal", "quantity"], + "time": ["hour", "minute", "second", "noon", "zone"], + "telephone": ["country_code", "number_part"], "electronic": ["username", "domain", "protocol"], }