Skip to content

Commit e23f6e1

Browse files
Fix for nv-bug 4786175 (#213)
* Implements the fix
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* Updates test cases for MONEY
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* Updates the cache
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
Co-authored-by: Simon Zuberek <szuberek@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 6048ce4 commit e23f6e1

File tree

6 files changed

+60
-21
lines changed

6 files changed

+60
-21
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ pipeline {
1313

1414
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
1515
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
16-
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-15-24-0'
16+
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-20-24-0'
1717
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
1818
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
1919
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'

nemo_text_processing/text_normalization/en/data/measure/unit.tsv

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,12 @@ ps PS
124124
s S
125125
tb TB
126126
tb YB
127-
zb ZB
127+
zb ZB
128+
sec second
129+
min minute
130+
hr hour
131+
d day
132+
wk week
133+
mo month
134+
yr year
135+
svc service

nemo_text_processing/text_normalization/en/data/money/per_unit.tsv

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,11 @@
11
/ea each
2-
/dozen
2+
/dozen per dozen
3+
/sec per second
4+
/min per minute
5+
/hr per hour
6+
/h per hour
7+
/d per day
8+
/wk per week
9+
/mo per month
10+
/yr per year
11+
/svc per service

nemo_text_processing/text_normalization/en/taggers/money.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@
2929
min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv"))
3030
min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv"))
3131
maj_singular = pynini.string_file((get_abs_path("data/money/currency_major.tsv")))
32+
per_units = pynini.string_file(get_abs_path("data/money/per_unit.tsv"))
3233

3334

3435
class MoneyFst(GraphFst):
3536
"""
36-
Finite state transducer for classifying money, suppletive aware, e.g.
37+
Finite state transducer for classifying money, suppletive aware, e.g.
3738
$12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
3839
$12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
3940
$1 -> money { currency_maj: "dollar" integer_part: "one" }
@@ -59,14 +60,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
5960
maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
6061
maj_unit_singular = convert_space(maj_singular)
6162

62-
graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
63-
graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")
63+
graph_maj_singular = pynutil.insert('currency_maj: "') + maj_unit_singular + pynutil.insert('"')
64+
graph_maj_plural = pynutil.insert('currency_maj: "') + maj_unit_plural + pynutil.insert('"')
6465

6566
optional_delete_fractional_zeros = pynini.closure(
6667
pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
6768
)
6869

69-
graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")
70+
graph_integer_one = pynutil.insert('integer_part: "') + pynini.cross("1", "one") + pynutil.insert('"')
7071
# only for decimals where third decimal after comma is non-zero or with quantity
7172
decimal_delete_last_zeros = (
7273
pynini.closure(NEMO_DIGIT | pynutil.delete(","))
@@ -81,9 +82,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
8182
graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
8283
)
8384

84-
graph_integer = (
85-
pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
86-
)
85+
graph_integer = pynutil.insert('integer_part: "') + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert('"')
8786

8887
graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
8988
graph_integer_only |= graph_maj_plural + insert_space + graph_integer
@@ -101,8 +100,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
101100
| ((NEMO_DIGIT - "0") + NEMO_DIGIT)
102101
)
103102

104-
graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
105-
graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")
103+
graph_min_singular = pynutil.insert(' currency_min: "') + min_singular + pynutil.insert('"')
104+
graph_min_plural = pynutil.insert(' currency_min: "') + min_plural + pynutil.insert('"')
106105
# format ** dollars ** cent
107106
decimal_graph_with_minor = None
108107
integer_graph_reordered = None
@@ -113,19 +112,19 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
113112
integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular
114113

115114
integer_plus_maj_with_comma = pynini.compose(
116-
NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
115+
NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj,
117116
)
118117
integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
119118
integer_plus_maj |= integer_plus_maj_with_comma
120119

121120
graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
122-
graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
121+
graph_fractional_one = pynutil.insert('fractional_part: "') + graph_fractional_one + pynutil.insert('"')
123122
graph_fractional = (
124123
two_digits_fractional_part
125124
@ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
126125
@ cardinal.graph_hundred_component_at_least_one_none_zero_digit
127126
)
128-
graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")
127+
graph_fractional = pynutil.insert('fractional_part: "') + graph_fractional + pynutil.insert('"')
129128

130129
fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
131130
fractional_plus_min |= (
@@ -138,9 +137,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
138137
decimal_graph_with_minor_curr |= pynutil.add_weight(
139138
integer_plus_maj
140139
+ pynini.cross(".", " ")
141-
+ pynutil.insert("fractional_part: \"")
140+
+ pynutil.insert('fractional_part: "')
142141
+ two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
143-
+ pynutil.insert("\""),
142+
+ pynutil.insert('"'),
144143
weight=0.0001,
145144
)
146145
default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
@@ -183,11 +182,16 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool =
183182
# weight for SH
184183
final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)
185184

185+
# utilizes morphosyntactic features to append "per units"
186+
graph_per_units = pynutil.insert(' morphosyntactic_features: "') + per_units + pynutil.insert('"')
187+
186188
if not deterministic:
187189
final_graph |= integer_graph_reordered | decimal_default_reordered
188190
# to handle "$2.00" cases
189191
final_graph |= pynini.compose(
190-
NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
192+
NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered,
191193
)
194+
195+
final_graph += graph_per_units.ques
192196
final_graph = self.add_tokens(final_graph.optimize())
193197
self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/en/verbalizers/money.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
delete_preserve_order,
2323
)
2424

25+
from nemo_text_processing.text_normalization.en.utils import get_abs_path
26+
27+
per_units = pynini.string_file(get_abs_path("data/money/per_unit.tsv"))
28+
2529

2630
class MoneyFst(GraphFst):
2731
"""
@@ -37,11 +41,11 @@ class MoneyFst(GraphFst):
3741
def __init__(self, decimal: GraphFst, deterministic: bool = True):
3842
super().__init__(name="money", kind="verbalize", deterministic=deterministic)
3943
keep_space = pynini.accep(" ")
40-
maj = pynutil.delete("currency_maj: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
41-
min = pynutil.delete("currency_min: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
44+
maj = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
45+
min = pynutil.delete('currency_min: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
4246

4347
fractional_part = (
44-
pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
48+
pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
4549
)
4650

4751
integer_part = decimal.integer
@@ -68,5 +72,14 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True):
6872
if not deterministic:
6973
graph |= graph_integer + delete_preserve_order
7074

75+
per_units_normalized = pynini.project(per_units, "output")
76+
remove_per_units_normalized = (
77+
pynutil.delete(' morphosyntactic_features: "')
78+
+ pynutil.insert(" ")
79+
+ per_units_normalized
80+
+ pynutil.delete('" ')
81+
)
82+
graph += remove_per_units_normalized.ques
83+
7184
delete_tokens = self.delete_tokens(graph)
7285
self.fst = delete_tokens.optimize()

tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,8 @@ $1,234.123~one thousand two hundred and thirty four point one two three dollars
6464
US $76.3 trillion~US seventy six point three trillion dollars
6565
US$76.3 trillion~seventy six point three trillion us dollars
6666
The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars
67+
$20/mo is what we are currently charging.~twenty dollars per month is what we are currently charging.
68+
$350/yr is the fee.~three hundred and fifty dollars per year is the fee.
69+
£10/wk sounds good to us.~ten pounds per week sounds good to us.
70+
$1/d is the new subscription cost.~one dollar per day is the new subscription cost.
71+
$0.5/hr is the total cost.~fifty cents per hour is the total cost.

0 commit comments

Comments
 (0)