From 06676cf554a3282a28aeb1c6ae396ba45e96ea30 Mon Sep 17 00:00:00 2001 From: Eric Joanis Date: Tue, 24 Aug 2021 14:42:44 -0400 Subject: [PATCH 1/3] fix: prepare also needs to declare its files as utf8 --- readalongs/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/readalongs/cli.py b/readalongs/cli.py index 62a761b6..31b0d0c1 100644 --- a/readalongs/cli.py +++ b/readalongs/cli.py @@ -304,7 +304,7 @@ def epub(**kwargs): context_settings=CONTEXT_SETTINGS, short_help="Convert a plain text file into the XML format for alignment.", ) -@click.argument("plaintextfile", type=click.File("r")) +@click.argument("plaintextfile", type=click.File("r", encoding="utf8")) @click.argument("xmlfile", type=click.Path(), required=False, default="") @click.option("-d", "--debug", is_flag=True, help="Add debugging messages to logger") @click.option( @@ -352,10 +352,10 @@ def prepare(**kwargs): out_file += ".xml" if out_file == "-": - filehandle, filename = create_input_tei( + _, filename = create_input_tei( input_file_handle=input_file, text_language=kwargs["language"], ) - with io.open(filename) as f: + with io.open(filename, encoding="utf8") as f: sys.stdout.write(f.read()) else: if not out_file.endswith(".xml"): @@ -365,7 +365,7 @@ def prepare(**kwargs): "Output file %s exists already, use -f to overwrite." 
% out_file ) - filehandle, filename = create_input_tei( + _, filename = create_input_tei( input_file_handle=input_file, text_language=kwargs["language"], output_file=out_file, From 230d39694153b115d0d841f93205c321f3754315 Mon Sep 17 00:00:00 2001 From: Eric Joanis Date: Tue, 24 Aug 2021 16:21:44 -0400 Subject: [PATCH 2/3] fix: make readalongs prepare Windows compatible - open the input file in utf-8 mode - don't open the input file until we need it, so we don't keep it open in case of error (causes unit testing errors when file handles are not closed so that temp files cannot be deleted) - the click file name for stdin is "-" on Windows, "<stdin>" on Linux --- readalongs/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/readalongs/cli.py b/readalongs/cli.py index 31b0d0c1..08a7d2f4 100644 --- a/readalongs/cli.py +++ b/readalongs/cli.py @@ -304,7 +304,7 @@ def epub(**kwargs): context_settings=CONTEXT_SETTINGS, short_help="Convert a plain text file into the XML format for alignment.", ) -@click.argument("plaintextfile", type=click.File("r", encoding="utf8")) +@click.argument("plaintextfile", type=click.File("r", encoding="utf8", lazy=True)) @click.argument("xmlfile", type=click.Path(), required=False, default="") @click.option("-d", "--debug", is_flag=True, help="Add debugging messages to logger") @click.option( @@ -344,7 +344,8 @@ def prepare(**kwargs): out_file = kwargs["xmlfile"] if not out_file: out_file = get_click_file_name(input_file) - if out_file == "": # actual intput_file.name when cli input is "-" + print(f"input_file={out_file}") + if out_file in ("", "-"): # intput_file.name is on Linux, - on Windows, when cli input is "-" out_file = "-" else: if out_file.endswith(".txt"): From 2598647eee4683f63b2bfa80711069f8b5e7f7c0 Mon Sep 17 00:00:00 2001 From: Eric Joanis Date: Tue, 24 Aug 2021 17:33:13 -0400 Subject: [PATCH 3/3] fix: more Windows compatibility issues fixed - skip the unit test that depends on making a directory non-writable, 
since Windows won't let us do it. - test_g2p_cli.py: write temp test files in utf8 - readalongs g2p: open input file in utf8 - make get_click_file_name() work on both Windows and Linux and in unit testing consistently - Failing to delete the failed tempfile is not a fatal error in align.py, just proceed! --- readalongs/align.py | 5 ++++- readalongs/cli.py | 35 ++++++++++++++++++++--------------- test/test_align_cli.py | 6 ++++++ test/test_g2p_cli.py | 6 +++--- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/readalongs/align.py b/readalongs/align.py index 8892f055..f1638b33 100644 --- a/readalongs/align.py +++ b/readalongs/align.py @@ -186,7 +186,10 @@ def align_audio( # noqa: C901 save_temps + "_processed" + ext, format=ext[1:] ) except CouldntEncodeError: - os.remove(save_temps + "_processed" + ext) + try: + os.remove(save_temps + "_processed" + ext) + except: + pass LOGGER.warning( f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'" ) diff --git a/readalongs/cli.py b/readalongs/cli.py index 08a7d2f4..b5ac6ffd 100644 --- a/readalongs/cli.py +++ b/readalongs/cli.py @@ -63,11 +63,23 @@ def create_app(): def get_click_file_name(click_file): - """ Return click_file.name, falling back to if the .name attribute is missing. """ + """ Wrapper around click_file.name with consistent handling for stdin + + On Windows, if click_file is stdin, click_file.name == "-". + On Linux, if click_file is stdin, click_file.name == "". 
+ During unit testing, the simulated stdin stream has no .name attribute + + Args: + click_file(click.File): the click file whose name we need + + Returns: + "-" if click_file represents stdin, click_file.name otherwise + """ try: - return click_file.name - except Exception: # For unit testing: simulated stdin stream has no .name attrib - return "" + name = click_file.name + except Exception: + name = "-" + return "-" if name == "" else name def parse_g2p_fallback(g2p_fallback_arg): @@ -344,10 +356,7 @@ def prepare(**kwargs): out_file = kwargs["xmlfile"] if not out_file: out_file = get_click_file_name(input_file) - print(f"input_file={out_file}") - if out_file in ("", "-"): # intput_file.name is on Linux, - on Windows, when cli input is "-" - out_file = "-" - else: + if out_file != "-": if out_file.endswith(".txt"): out_file = out_file[:-4] out_file += ".xml" @@ -408,9 +417,7 @@ def tokenize(**kwargs): if not kwargs["tokfile"]: output_path = get_click_file_name(input_file) - if output_path == "": - output_path = "-" - else: + if output_path != "-": if output_path.endswith(".xml"): output_path = output_path[:-4] output_path += ".tokenized.xml" @@ -447,7 +454,7 @@ def tokenize(**kwargs): short_help="Apply g2p to a tokenized file, like 'align' does.", # NOT TRUE YET: "Apply g2p to a tokenized file, in preparation for alignment." 
) -@click.argument("tokfile", type=click.File("rb")) +@click.argument("tokfile", type=click.File("rb", encoding="utf8", lazy=True)) @click.argument("g2pfile", type=click.Path(), required=False, default="") @click.option( "--g2p-fallback", @@ -488,9 +495,7 @@ def g2p(**kwargs): if not kwargs["g2pfile"]: output_path = get_click_file_name(input_file) - if output_path == "": - output_path = "-" - else: + if output_path != "-": if output_path.endswith(".xml"): output_path = output_path[:-4] if output_path.endswith(".tokenized"): diff --git a/test/test_align_cli.py b/test/test_align_cli.py index e27d0d17..e489b472 100755 --- a/test/test_align_cli.py +++ b/test/test_align_cli.py @@ -167,6 +167,12 @@ def test_invoke_align(self): ) def test_permission_denied(self): + import platform + + if platform.system() == "Windows": + # Cannot change the permission on a directory in Windows through + # os.mkdir() or os.chmod(), so skip this test + return dir = join(self.tempdir, "permission_denied") os.mkdir(dir, mode=0o444) results = self.runner.invoke( diff --git a/test/test_g2p_cli.py b/test/test_g2p_cli.py index 77f2a90c..8c040f7f 100755 --- a/test/test_g2p_cli.py +++ b/test/test_g2p_cli.py @@ -96,7 +96,7 @@ def test_mixed_langs(self): # saving the final results into filename. # filename is assumed to be inside self.tempdir, so we count on tearDown() to clean up. 
def write_prepare_tokenize(self, text, lang, filename): - with open(filename + ".input.txt", "w") as f: + with open(filename + ".input.txt", "w", encoding="utf8") as f: print(text, file=f) self.runner.invoke( prepare, ["-l", lang, filename + ".input.txt", filename + ".prepared.xml"] @@ -197,7 +197,7 @@ def test_three_way_fallback(self): def test_align_with_error(self): text_file = os.path.join(self.tempdir, "input.txt") - with io.open(text_file, "w") as f: + with io.open(text_file, "w", encoding="utf8") as f: print("In French été works but Nunavut ᓄᓇᕗᑦ does not.", file=f) empty_wav = os.path.join(self.tempdir, "empty.wav") with io.open(empty_wav, "wb"): @@ -241,7 +241,7 @@ def test_align_with_error(self): def test_with_stdin(self): input_file = os.path.join(self.data_dir, "fra-tokenized.xml") - with io.open(input_file) as f: + with io.open(input_file, encoding="utf8") as f: inputtext = f.read() results = self.runner.invoke(g2p, "-", input=inputtext) self.assertEqual(results.exit_code, 0)