Skip to content

Commit 6048ce4

Browse files
EN TN Fixes for Issue 166 (#207)
* Rebases the updated main Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Passes Pynini fails SP Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Adjusts the weights on the domain graph Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Enables semiotic classes for SP tests Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reweights the tokenizer Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Updates test cases Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates test cases Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Cleans up ELECTRONIC tagger Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates test cases Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Updates Jenkins Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Enables all CI tests Signed-off-by: Simon Zuberek <szuberek@nvidia.com> * Updates EN TN Cache Signed-off-by: Simon Zuberek <szuberek@nvidia.com> --------- Signed-off-by: Simon Zuberek <szuberek@nvidia.com> Co-authored-by: Simon Zuberek <szuberek@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f2664c1 commit 6048ce4

File tree

7 files changed

+33
-28
lines changed

7 files changed

+33
-28
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ pipeline {
1313

1414
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
1515
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
16-
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-02-24-0'
16+
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-15-24-0'
1717
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
1818
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
1919
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'

nemo_text_processing/text_normalization/en/data/electronic/domain.tsv

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
.com dot com
22
.org dot org
33
.gov dot gov
4+
.edu dot edu
45
.uk dot UK
56
.fr dot FR
67
.net dot net

nemo_text_processing/text_normalization/en/taggers/electronic.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@
2222
NEMO_DIGIT,
2323
NEMO_NOT_SPACE,
2424
NEMO_SIGMA,
25-
NEMO_SPACE,
2625
NEMO_UPPER,
2726
TO_UPPER,
2827
GraphFst,
@@ -55,6 +54,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
5554
cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT)
5655

5756
accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
57+
5858
accepted_common_domains = pynini.project(
5959
pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
6060
)
@@ -115,16 +115,19 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
115115
username + domain_graph_with_class_tags,
116116
)
117117

118-
# abc.com, abc.com/123-sm
119-
# when only domain, make sure it starts and end with NEMO_ALPHA
120-
graph |= (
118+
full_stop_accep = pynini.accep(".")
119+
dollar_accep = pynini.accep("$") # Include for the correct transduction of the money graph
120+
excluded_symbols = full_stop_accep | dollar_accep
121+
filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols)
122+
accepted_characters = NEMO_ALPHA | NEMO_DIGIT | filtered_symbols
123+
domain_component = full_stop_accep + pynini.closure(accepted_characters, 2)
124+
graph_domain = (
121125
pynutil.insert('domain: "')
122-
+ pynini.compose(
123-
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + accepted_common_domains + pynini.closure(NEMO_NOT_SPACE),
124-
domain_graph,
125-
).optimize()
126+
+ (pynini.closure(accepted_characters, 1) + pynini.closure(domain_component, 1))
126127
+ pynutil.insert('"')
127-
)
128+
).optimize()
129+
130+
graph |= pynutil.add_weight(graph_domain, MIN_NEG_WEIGHT)
128131

129132
# www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
130133
graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags

nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ class ClassifyFst(GraphFst):
5353
Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
5454
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
5555
More details to deployment at NeMo/tools/text_processing_deployment.
56-
56+
5757
Args:
5858
input_case: accepting either "lower_cased" or "cased" input.
5959
deterministic: if True will provide a single transduction option,
@@ -78,11 +78,11 @@ def __init__(
7878
os.makedirs(cache_dir, exist_ok=True)
7979
whitelist_file = os.path.basename(whitelist) if whitelist else ""
8080
far_file = os.path.join(
81-
cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
81+
cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
8282
)
8383
if not overwrite_cache and far_file and os.path.exists(far_file):
8484
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
85-
logger.info(f'ClassifyFst.fst was restored from {far_file}.')
85+
logger.info(f"ClassifyFst.fst was restored from {far_file}.")
8686
else:
8787
logger.info(f"Creating ClassifyFst grammars.")
8888

@@ -107,7 +107,7 @@ def __init__(
107107
logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes")
108108

109109
start_time = time.time()
110-
measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
110+
measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic,)
111111
measure_graph = measure.fst
112112
logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")
113113

@@ -157,7 +157,7 @@ def __init__(
157157
time_final = pynini.compose(time_graph, v_time_graph)
158158
date_final = pynini.compose(date_graph, v_date_graph)
159159
range_graph = RangeFst(
160-
time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic
160+
time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic,
161161
).fst
162162
logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes")
163163

@@ -171,10 +171,10 @@ def __init__(
171171
| pynutil.add_weight(ordinal_graph, 1.1)
172172
| pynutil.add_weight(money_graph, 1.1)
173173
| pynutil.add_weight(telephone_graph, 1.1)
174-
| pynutil.add_weight(electonic_graph, 1.1)
174+
| pynutil.add_weight(electonic_graph, 1.11)
175175
| pynutil.add_weight(fraction_graph, 1.1)
176176
| pynutil.add_weight(range_graph, 1.1)
177-
| pynutil.add_weight(serial_graph, 1.1003) # should be higher than the rest of the classes
177+
| pynutil.add_weight(serial_graph, 1.12) # should be higher than the rest of the classes
178178
)
179179

180180
# roman_graph = RomanFst(deterministic=deterministic).fst
@@ -187,7 +187,8 @@ def __init__(
187187
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
188188
punct = pynini.closure(
189189
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
190-
| (pynutil.insert(" ") + punct),
190+
| (pynutil.insert(" ") + punct)
191+
| punct,
191192
1,
192193
)
193194

tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ a@hotmail.fr~a at hotmail dot FR
44
a@hotmail.it~a at hotmail dot IT
55
a@aol.it~a at aol dot IT
66
a@msn.it~a at msn dot IT
7-
cdf@abc.edu~cdf at abc dot EDU
7+
cdf@abc.edu~cdf at abc dot edu
88
abc@gmail.abc~abc at gmail dot ABC
99
abc@abc.com~abc at abc dot com
1010
asdf123@abc.com~asdf one two three at abc dot com
@@ -13,9 +13,9 @@ ab3.sdd.3@gmail.com~ab three dot sdd dot three at gmail dot com
1313
email is abc1@gmail.com~email is abc one at gmail dot com
1414
abs@nvidia.com~abs at NVIDIA dot com
1515
email is a-1.b3_c&-d@gma4i-l.com test~email is a dash one dot b three underscore c ampersand dash d at gma four i dash l dot com test
16-
nvidia.com~NVIDIA dot com
16+
nvidia.com~nvidia dot com
1717
test.com~test dot com
18-
test.abc~test.abc
18+
test.abc~test dot abc
1919
http://www.ourdailynews.com.sm~HTTP colon slash slash WWW dot ourdailynews dot com dot SM
2020
https://www.ourdailynews.com.sm~HTTPS colon slash slash WWW dot ourdailynews dot com dot SM
2121
www.ourdailynews.com.sm~WWW dot ourdailynews dot com dot SM
@@ -24,8 +24,8 @@ sdf@gmail.com.sm~sdf at gmail dot com dot SM
2424
sdf@gmail.com/123-sm~sdf at gmail dot com slash one two three dash SM
2525
sdf@gmail.abc/123-sm~sdf at gmail dot ABC slash one two three dash SM
2626
sdf@gmail.abc/123456-sm~sdf at gmail dot ABC slash one two three four five six dash SM
27-
ourdailynews.com/12-sm~ourdailynews dot com slash one two dash SM
28-
ourdailynews.abc~ourdailynews.abc
27+
ourdailynews.com/12-sm~ourdailynews dot com slash one two dash sm
28+
ourdailynews.abc~ourdailynews dot abc
2929
file:///c/code/NeMo/docs/build/html/processing/intro.html~file colon slash slash slash c slash code slash nemo slash docs slash build slash html slash processing slash intro dot HTML
3030
file:///photos/image.jpg~file colon slash slash slash photos slash image dot jpeg
3131
electronic test.com and test2.uk~electronic test dot com and test two dot UK
@@ -38,4 +38,5 @@ rtxprohelp@exchange.nvidia.com~RTX pro help at exchange dot NVIDIA dot com
3838
enterpriseservices@nvidia.com~enterprise services at NVIDIA dot com
3939
enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com
4040
https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash
41-
i can use your card ending in 8876~i can use your card ending in eight eight seven six
41+
i can use your card ending in 8876~i can use your card ending in eight eight seven six
42+
here is mail.nasa.gov.~here is mail dot nasa dot gov.

tests/nemo_text_processing/en/data_text_normalization/test_cases_punctuation.txt

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,6 @@ a: ``b``, ``c``, ``d``, ``e``, ``f`` or ``g``.~a: "b", "c", "d", "e", "f" or "g"
3939
"And now," he said, "He,~"And now," he said, "He,
4040
80's~eighty's
4141
test .b@gmail.com~test. b at gmail dot com
42-
ourdailynews.abc/123-sm~ourdailynews. abc/one hundred twenty three-sm
4342
123,000012~one hundred and twenty three, zero zero zero zero one two
4443
9000,000th~nine thousand, zero zero zero th
4544
16 July, 1943~the sixteenth of july, nineteen forty three
@@ -59,6 +58,6 @@ dr. Evil~dr. Evil
5958
1!!!!~one!!!!
6059
(1)Hello~(one) Hello
6160
ÀÁÂÃ check §- and ƛ, also ɧ~ÀÁÂÃ check section - and ƛ, also ɧ
62-
Hi it's 5pm,4A.M.?-34. Hi,no,yes,34! 12,again,4 and NO?17 and $.01,here & there--0.004kg~Hi it's five PM, four AM.? minus thirty four. Hi,no,yes, thirty four! twelve, again, four and NO? seventeen and one cent, here and there - minus zero point zero zero four kilograms
61+
Hi it's 5pm,4A.M.?-34. Hi,no,yes,34! 12,again,4 and NO?17 and $.01,here & there -0.004kg~Hi it's five PM, four AM.? minus thirty four. Hi,no,yes, thirty four! twelve, again, four and NO? seventeen and one cent, here and there minus zero point zero zero four kilograms
6362
1°C.~one degree Celsius.
6463
my email is myemail@gmail.com!~my email is myemail at gmail dot com!

tests/nemo_text_processing/en/data_text_normalization/test_cases_punctuation_match_input.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,4 +10,4 @@ a: ``b``,``c``, ``d``, ``e``, ``f`` or ``g``.~a: "b","c", "d", "e", "f" or "g".
1010
1 ,~one ,
1111
(1)Hello~(one)Hello
1212
ÀÁÂÃ check §- and ƛ, also ɧ~ÀÁÂÃ check section- and ƛ, also ɧ
13-
Hi it's 5pm,4A.M.?-34. Hi,no,yes,34! 12,again,4 and NO?17 and $.01,here & there--0.004kg~Hi it's five PM,four AM.?minus thirty four. Hi,no,yes,thirty four! twelve,again,four and NO?seventeen and one cent,here and there - minus zero point zero zero four kilograms
13+
Hi it's 5pm,4A.M.?-34. Hi,no,yes,34! 12,again,4 and NO?17 and $.01,here & there -0.004kg~Hi it's five PM,four AM.?minus thirty four. Hi,no,yes,thirty four! twelve,again,four and NO?seventeen and one cent,here and there minus zero point zero zero four kilograms

0 commit comments

Comments
 (0)