diff --git a/requirements.txt b/requirements.txt
index 138f082..5441c7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,6 @@ torch
 numpy
 requests
 tqdm
+google-compute-engine
+gensim
+scrapy
diff --git a/src/iranlowo/adr.py b/src/iranlowo/adr.py
index 1012213..e57e730 100644
--- a/src/iranlowo/adr.py
+++ b/src/iranlowo/adr.py
@@ -4,272 +4,11 @@ from __future__ import unicode_literals
 
 import pkg_resources
-import re
-import unicodedata
-
 from argparse import Namespace
-from collections import defaultdict
 
 from onmt.translate.translator import build_translator
 from onmt.utils.parse import ArgumentParser
 
 
-def strip_accents_text(text_string):
-    """
-    Converts the string to NFD, separates & returns only the base characters
-    :param text_string:
-    :return: input string without diacritic adornments on base characters
-    """
-    return "".join(
-        c
-        for c in unicodedata.normalize("NFD", text_string)
-        if unicodedata.category(c) != "Mn"
-    )
-
-
-def strip_accents_file(filename, outfilename):
-    """
-    Reads filename containing diacritics, converts to NFC for consistency,
-    then writes outfilename with diacritics removed
-    :param filename:
-    :param outfilename:
-    :return: None
-    """
-    text = "".join(
-        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
-    )
-    try:
-        f = open(outfilename, "w")
-    except EnvironmentError:
-        return False
-    else:
-        with f:
-            f.write(strip_accents_text(text))
-        return True
-
-
-def is_file_nfc(path):
-    """
-
-    Args:
-        path: File path
-
-    Returns: True if file is valid nfc and False if not. Raises a ValueError if path is not correct
-
-    """
-    text = open(path).read()
-    return is_text_nfc(text)
-
-
-def is_text_nfc(text):
-    """Validate unicode form of given text"""
-    nfc_text = "".join(c for c in unicodedata.normalize("NFC", text))
-    if nfc_text == text:
-        return True
-    else:
-        return False
-
-
-def normalize_diacritics_text(text_string):
-    """Convenience wrapper to abstract away unicode & NFC"""
-    return unicodedata.normalize("NFC", text_string)
-
-
-def normalize_diacritics_file(filename, outfilename):
-    """File based Convenience wrapper to abstract away unicode & NFC"""
-    try:
-        text = "".join(
-            c
-            for c in unicodedata.normalize(
-                "NFC", open(filename, encoding="utf-8").read()
-            )
-        )
-        with open(outfilename, "w", encoding="utf-8") as f:
-            f.write(text)
-    except EnvironmentError:
-        return False
-    else:
-        return True
-
-
-def file_info(filename):
-    """File metadata useful for various ADR tasks"""
-
-    print("\nFilename: " + filename)
-    print("---------------------------------")
-
-    lines = tuple(open(filename, "r", encoding="utf-8"))
-    num_utts = len(lines)
-
-    text = "".join(
-        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
-    )
-    words = re.findall("\w+", text)
-    num_words = len(words)
-    num_chars = len(re.findall(r"\S", text))
-
-    unique_chars = set(text)
-    num_uniq_chars = len(unique_chars)
-
-    print(sorted(unique_chars))
-    print("# utts : " + str(num_utts))
-    print("# chars : " + str(num_chars))
-    print("# uniq chars: " + str(num_uniq_chars))
-
-    # unaccented word stats
-    unaccented_words = 0
-    for word in words:
-        if word == strip_accents_text(word):
-            unaccented_words += 1
-
-    print("# total words: " + str(num_words))
-    print("# unaccented words : " + str(unaccented_words))
-    print("-----------------------------------------------")
-
-    # ambiguous word stats
-    ambiguity_map = defaultdict(set)
-    for word in words:
-        no_accents = strip_accents_text(word)
-        ambiguity_map[no_accents].add(word)
-
-    ambiguous_words = 0
-    ambiguous_words_2 = 0
-    ambiguous_words_3 = 0
-    ambiguous_words_4 = 0
-    ambiguous_words_5 = 0
-    ambiguous_words_6 = 0
-    ambiguous_words_7 = 0
-    ambiguous_words_8 = 0
-    ambiguous_words_9 = 0
-
-    # fill ambiguity map
-    for word in ambiguity_map:
-        if len(ambiguity_map[word]) > 1:
-            ambiguous_words += 1
-            if len(ambiguity_map[word]) == 2:
-                ambiguous_words_2 += 1
-            elif len(ambiguity_map[word]) == 3:
-                ambiguous_words_3 += 1
-            elif len(ambiguity_map[word]) == 4:
-                ambiguous_words_4 += 1
-            elif len(ambiguity_map[word]) == 5:
-                ambiguous_words_5 += 1
-            elif len(ambiguity_map[word]) == 6:
-                ambiguous_words_6 += 1
-            elif len(ambiguity_map[word]) == 7:
-                ambiguous_words_7 += 1
-            elif len(ambiguity_map[word]) == 8:
-                ambiguous_words_8 += 1
-            elif len(ambiguity_map[word]) == 9:
-                ambiguous_words_9 += 1
-
-    # print ambiguity map
-    for word in ambiguity_map:
-        if len(ambiguity_map[word]) == 2:
-            print("# 2: " + str(ambiguity_map[word]))
-        if len(ambiguity_map[word]) == 3:
-            print("# 3: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 4:
-            print("# 4: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 5:
-            print("# 5: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 6:
-            print("# 6: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 7:
-            print("# 7: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 8:
-            print("# 8: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 9:
-            print("# 9: " + str(ambiguity_map[word]))
-
-    print("# unique ambiguous words : " + str(ambiguous_words))
-    print("# total unique non-diacritized words : " + str(len(ambiguity_map)))
-
-    unique_all_words = set()
-    for word in words:
-        unique_all_words.add(word)
-
-    print("# total unique words : " + str(len(unique_all_words)))
-    print("-----------------------------------------------")
-    print("# ambiguous 2 words : " + str(ambiguous_words_2))
-    print("# ambiguous 3 words : " + str(ambiguous_words_3))
-    print("# ambiguous 4 words : " + str(ambiguous_words_4))
-    print("# ambiguous 5 words : " + str(ambiguous_words_5))
-    print("# ambiguous 6 words : " + str(ambiguous_words_6))
-    print("# ambiguous 7 words : " + str(ambiguous_words_7))
-    print("# ambiguous 8 words : " + str(ambiguous_words_8))
-    print("# ambiguous 9 words : " + str(ambiguous_words_9))
-
-
-def split_corpus_on_symbol(filename, outfilename, symbol=","):
-    """
-    For yoruba blog (and probably bibeli mimo)
-
-    Args: filenames for I/O and symbol to split lines on
-    Returns: writes outputfile
-    :param filename: input file
-    :param outfilename: processed output file to write
-    :param symbol: to split lines on
-    :return: None, with side-effect of writing an outputfile
-    """
-
-    lines = tuple(open(filename, "r", encoding="utf-8"))
-
-    min_words_to_split = 10
-    min_words_in_utt = 5
-
-    with open(outfilename, "w") as f:
-        # split out heavily comma'd text :((
-        for line in lines:
-            if symbol in line:
-                num_words = len(line.split())
-                num_commas = line.count(symbol)
-                curr_comma_position = line.index(symbol)
-                num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())
-
-                curr_line = line
-                while num_commas > 0:
-                    if num_words < min_words_to_split:
-                        # print(curr_line.strip())
-                        f.write(curr_line)
-                        break
-                    if num_words >= min_words_to_split:
-                        if (
-                            num_words_ahead_of_curr_comma >= min_words_in_utt
-                            and len((curr_line)[curr_comma_position:].split())
-                            >= min_words_in_utt
-                        ):
-                            f.write((curr_line)[0:curr_comma_position] + "\n")
-
-                            # update vars
-                            curr_line = curr_line[curr_comma_position + 1:]
-                            num_words = len(curr_line.split())
-                            num_commas = num_commas - 1
-                            if num_commas > 0:
-                                curr_comma_position = curr_line.index(symbol)
-                                num_words_ahead_of_curr_comma = len(
-                                    curr_line[0:curr_comma_position].split()
-                                )
-                            else:
-                                f.write(curr_line)
-                        else:
-                            # ignore too short comma (+= vs = on current comma position)
-                            num_commas = num_commas - 1
-                            if num_commas > 0:  # for say 3 commas
-                                curr_comma_position += (
-                                    curr_line[curr_comma_position + 1:].index(symbol)
-                                    + 1
-                                )
-                                num_words_ahead_of_curr_comma = len(
-                                    curr_line[0:curr_comma_position].split()
-                                )
-                            else:
-                                f.write(curr_line)
-                    else:
-                        f.write(curr_line)
-            else:
-                f.write(line)
-
-
 def diacritize_text(undiacritized_text, verbose=False):
     # manually construct the options so we don't have to pass them in.
     opt = Namespace()
@@ -339,13 +78,3 @@ def diacritize_text(undiacritized_text, verbose=False):
     )
 
     return prediction[0][0]
-
-if __name__ == "__main__":
-    # # test
-    print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFD
-    print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFC
-    print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Book_of_Mormon/cleaned/doctrine_and_covenants.txt'))
-
-    print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Owe/yoruba_proverbs_out.txt'))
-
-    # file_info("../../tests/testdata/nfc.txt")
diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py
new file mode 100644
index 0000000..4201d4e
--- /dev/null
+++ b/src/iranlowo/corpus/__init__.py
@@ -0,0 +1,2 @@
+from .corpus import Corpus, DirectoryCorpus
+from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
\ No newline at end of file
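The package __init__ above re-exports the corpus classes at package level. A minimal usage sketch of the intended import surface (illustrative only; "owe.txt" is a hypothetical UTF-8 corpus file):

    from iranlowo.corpus import Corpus

    corp = Corpus(path="owe.txt")  # reads the file and NFC-normalizes it by default
    print(len(corp.data))          # size of the preprocessed corpus text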
diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
new file mode 100644
index 0000000..81fa2ff
--- /dev/null
+++ b/src/iranlowo/corpus/corpus.py
@@ -0,0 +1,132 @@
+import gzip
+import os
+
+from gensim import interfaces
+from gensim.corpora.csvcorpus import CsvCorpus
+from gensim.corpora.textcorpus import walk
+
+from iranlowo.preprocessing import is_valid_owé_format, normalize_diacritics_text
+from iranlowo.utils import is_text_nfc
+
+
+class Corpus(interfaces.CorpusABC):
+    def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
+        """
+        A text corpus read from a path, a list of paths, or a raw string.
+
+        Args:
+            path: path to a corpus file, a list of such paths, or an open file object.
+            text: raw corpus text, used instead of reading from path.
+            stream: if True, read the source lazily, line by line.
+            fformat: file format, one of 'txt', 'csv' or 'gzip'.
+            cformat: optional corpus format to validate against, e.g. 'owe'.
+            labels: for CSV corpora, whether the file carries labels.
+            preprocess: a callable or list of callables applied to the text;
+                defaults to NFC normalization.
+        """
+        self.path = path
+        self.text = text
+        self.labels = labels
+        self.stream = stream
+        self.fformat = fformat
+        self.cformat = cformat
+        self.preprocess = preprocess
+        assert self.path or self.text, "You should pass either a path or text to read data from."
+        if not self.preprocess:
+            self.preprocess = [normalize_diacritics_text]
+        self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
+        self.validate_format()
+
+    def __iter__(self):
+        for line in self.data:
+            yield line
+
+    def __len__(self):
+        return len(self.data)
+
+    @staticmethod
+    def save_corpus(fname, corpus, id2word=None, metadata=False):
+        pass
+
+    def streamfile(self, fobj):
+        with fobj as obj:
+            for line in obj:
+                yield line
+
+    def read_file_filename_or_text(self, f=None, text=None):
+        """
+        Read corpus data from a file path, a list of paths, an open file
+        object, or a raw text string, applying any configured preprocessing.
+
+        Returns: the corpus text, or a list of texts when given a list of paths.
+        """
+        path = f if f else self.path
+        out = []
+        if text:
+            return self.handle_preprocessing(text) if self.preprocess else text
+        elif isinstance(path, list):
+            for f in path:
+                text = self.read_file_filename_or_text(f)
+                out.append(text)
+            return out
+        else:
+            if isinstance(path, str):
+                if self.fformat == "txt":
+                    text = open(path)
+                elif self.fformat == "csv":
+                    text = CsvCorpus(path, self.labels)
+                elif self.fformat == 'gzip':
+                    text = gzip.open(path)
+            else:
+                self.path.seek(0)
+                text = self.path
+
+            text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
+            return self.handle_preprocessing(text) if self.preprocess else text
+
+    def handle_preprocessing(self, text):
+        if callable(self.preprocess):
+            return self.preprocess(text)
+        if isinstance(self.preprocess, list):
+            for technique in self.preprocess:
+                text = technique(text)
+            return text
+
+    def validate_format(self):
+        """
+        Check that the corpus data is NFC-normalized, or that it matches the
+        declared corpus format (cformat).
+
+        Returns: True if the corpus is valid; raises TypeError otherwise.
+        """
+        data = self.data
+        if isinstance(data, list):
+            data = ''.join(data)
+        if not self.cformat and not is_text_nfc(data):
+            raise TypeError("The corpus does not comply with the NFC corpus format")
+        elif self.cformat == "owe":
+            if not is_valid_owé_format(data):
+                raise TypeError("The corpus does not comply with the {0} corpus format".format(self.cformat))
+        else:
+            return True
+
+    def generate(self, size):
+        """
+        Generate random text in the corpus format.
+
+        Args:
+            size: number of utterances to generate.
+
+        Returns: the generated text.
+        """
+        if not self.cformat:
+            raise ValueError("You need to specify a format for generating random text")
+
+
+class DirectoryCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        self.dir_path = path
+        self.depth = kwargs.pop('min_depth', 0)
+        self.path = list(self.read_files())
+        super(DirectoryCorpus, self).__init__(path=self.path, **kwargs)
+
+    def read_files(self):
+        walked = list(walk(self.dir_path))
+        if not walked:
+            raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path))
+        for depth, dirpath, _, filenames in walked:
+            if self.depth <= depth:
+                for path in filenames:
+                    yield os.path.join(dirpath, path)
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
new file mode 100644
index 0000000..1314af6
--- /dev/null
+++ b/src/iranlowo/corpus/loaders.py
@@ -0,0 +1,66 @@
+import os
+
+from iranlowo.corpus import Corpus, DirectoryCorpus
+
+
+class BaseLoader(object):
+    def __init__(self, corpus_path):
+        self.corpus_path = corpus_path
+        yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None)
+        if not yoruba_text_path:
+            raise NotADirectoryError(
+                "YORUBA_TEXT_PATH environment variable not found. Please clone the corpus repository "
+                "from https://github.com/Niger-Volta-LTI/yoruba-text and set YORUBA_TEXT_PATH to its path")
+        else:
+            corpus_path = "{}/{}".format(yoruba_text_path, corpus_path)
+            self.path = corpus_path
+
+
+class YorubaBlogCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from the Yorùbá blog dataset.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(YorubaBlogCorpus, self).__init__(path=path, **kwargs)
+
+
+class BBCCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from BBC Yorùbá.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(BBCCorpus, self).__init__(path=path, **kwargs)
+
+
+class BibeliCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from Bibeli Mimọ.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(BibeliCorpus, self).__init__(path=path, **kwargs)
+
+
+class en(BaseLoader, DirectoryCorpus):
+    def __init__(self):
+        BaseLoader.__init__(self, corpus_path="Owe/en")
+        DirectoryCorpus.__init__(self, path=self.path)
+
+
+class yo(BaseLoader, DirectoryCorpus):
+    def __init__(self):
+        BaseLoader.__init__(self, corpus_path="Owe/yo")
+        DirectoryCorpus.__init__(self, path=self.path)
+
+
+class OweLoader(object):
+    def __init__(self):
+        self.en = en()
+        self.yo = yo()
+
diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py
new file mode 100644
index 0000000..5ba518e
--- /dev/null
+++ b/src/iranlowo/interfaces.py
@@ -0,0 +1,14 @@
+import scrapy
+
+
+class Scrapper(scrapy.Spider):
+    """
+    Interface for scraping data with :mod:`iranlowo.scrapper`
+    """
+
+    def __init__(self, name, urls, **kwargs):
+        super(Scrapper, self).__init__(name, **kwargs)
+        self.start_urls = urls
+
+    def parse(self, response):
+        pass
+
diff --git a/src/iranlowo/preprocessing.py b/src/iranlowo/preprocessing.py
index fe72ee7..73545fe 100644
--- a/src/iranlowo/preprocessing.py
+++ b/src/iranlowo/preprocessing.py
@@ -1,5 +1,6 @@
 import csv
 import gzip
+import unicodedata
 
 from pathlib import Path
 
@@ -90,3 +91,130 @@ def get_chunk(txt, n):
         except IndexError:
             pass  # End of file reached
 
+
+def strip_accents_text(text_string):
+    """
+    Converts the string to NFD, separates & returns only the base characters
+    :param text_string:
+    :return: input string without diacritic adornments on base characters
+    """
+    return "".join(
+        c
+        for c in unicodedata.normalize("NFD", text_string)
+        if unicodedata.category(c) != "Mn"
+    )
+
+
+def strip_accents_file(filename, outfilename):
+    """
+    Reads filename containing diacritics, converts to NFC for consistency,
+    then writes outfilename with diacritics removed
+    :param filename:
+    :param outfilename:
+    :return: True on success, False if the output file could not be opened
+    """
+    text = "".join(
+        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
+    )
+    try:
+        f = open(outfilename, "w", encoding="utf-8")
+    except EnvironmentError:
+        return False
+    else:
+        with f:
+            f.write(strip_accents_text(text))
+        return True
+
+
+def normalize_diacritics_text(text_string):
+    """Convenience wrapper to abstract away unicode & NFC"""
+    return unicodedata.normalize("NFC", text_string)
+
+
+def normalize_diacritics_file(filename, outfilename):
+    """File based convenience wrapper to abstract away unicode & NFC"""
+    try:
+        text = "".join(
+            c
+            for c in unicodedata.normalize(
+                "NFC", open(filename, encoding="utf-8").read()
+            )
+        )
+        with open(outfilename, "w", encoding="utf-8") as f:
+            f.write(text)
+    except EnvironmentError:
+        return False
+    else:
+        return True
+
+
+def split_corpus_on_symbol(filename, outfilename, symbol=","):
+    """
+    Splits long symbol-delimited lines in the input file into shorter
+    utterances, for the Yorùbá blog corpus (and probably Bibeli Mimọ).
+
+    :param filename: input file
+    :param outfilename: processed output file to write
+    :param symbol: symbol to split lines on
+    :return: None, with the side effect of writing an output file
+    """
+
+    lines = tuple(open(filename, "r", encoding="utf-8"))
+
+    min_words_to_split = 10
+    min_words_in_utt = 5
+
+    with open(outfilename, "w", encoding="utf-8") as f:
+        # split out heavily comma'd text :((
+        for line in lines:
+            if symbol in line:
+                num_words = len(line.split())
+                num_commas = line.count(symbol)
+                curr_comma_position = line.index(symbol)
+                num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())
+
+                curr_line = line
+                while num_commas > 0:
+                    if num_words < min_words_to_split:
+                        f.write(curr_line)
+                        break
+                    if num_words >= min_words_to_split:
+                        if (
+                            num_words_ahead_of_curr_comma >= min_words_in_utt
+                            and len(curr_line[curr_comma_position:].split())
+                            >= min_words_in_utt
+                        ):
+                            f.write(curr_line[0:curr_comma_position] + "\n")
+
+                            # update vars
+                            curr_line = curr_line[curr_comma_position + 1:]
+                            num_words = len(curr_line.split())
+                            num_commas = num_commas - 1
+                            if num_commas > 0:
+                                curr_comma_position = curr_line.index(symbol)
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                f.write(curr_line)
+                        else:
+                            # ignore too short comma (+= vs = on current comma position)
+                            num_commas = num_commas - 1
+                            if num_commas > 0:  # for say 3 commas
+                                curr_comma_position += (
+                                    curr_line[curr_comma_position + 1:].index(symbol)
+                                    + 1
+                                )
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                f.write(curr_line)
+                    else:
+                        f.write(curr_line)
+            else:
+                f.write(line)
+
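The helpers moved into preprocessing.py are pure string functions, so they can be exercised directly; the expected value below follows from the NFD-stripping behaviour asserted in the tests:

    from iranlowo.preprocessing import normalize_diacritics_text, strip_accents_text

    s = normalize_diacritics_text("ọjọ́ìbí")  # compose to NFC
    print(strip_accents_text(s))              # "ojoibi" -- combining marks removed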
diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py
new file mode 100644
index 0000000..f4ab2c5
--- /dev/null
+++ b/src/iranlowo/utils.py
@@ -0,0 +1,142 @@
+import re
+import unicodedata
+from collections import defaultdict
+
+from pathlib import Path
+
+from iranlowo.preprocessing import strip_accents_text
+
+
+def is_file_nfc(path):
+    """
+    Args:
+        path: File path
+
+    Returns: True if the file content is NFC-normalized, False if not.
+    Raises FileNotFoundError if the path does not exist.
+    """
+    text = open(path, encoding="utf-8").read()
+    return is_text_nfc(text)
+
+
+def is_text_nfc(text):
+    """Validate unicode form of given text"""
+    return unicodedata.normalize("NFC", text) == text
+
+
+def string_to_path(string):
+    return Path(string)
+
+
+def file_info(filename):
+    """File metadata useful for various ADR tasks"""
+
+    print("\nFilename: " + filename)
+    print("---------------------------------")
+
+    lines = tuple(open(filename, "r", encoding="utf-8"))
+    num_utts = len(lines)
+
+    text = "".join(
+        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
+    )
+    words = re.findall(r"\w+", text)
+    num_words = len(words)
+    num_chars = len(re.findall(r"\S", text))
+
+    unique_chars = set(text)
+    num_uniq_chars = len(unique_chars)
+
+    print(sorted(unique_chars))
+    print("# utts : " + str(num_utts))
+    print("# chars : " + str(num_chars))
+    print("# uniq chars: " + str(num_uniq_chars))
+
+    # unaccented word stats
+    unaccented_words = 0
+    for word in words:
+        if word == strip_accents_text(word):
+            unaccented_words += 1
+
+    print("# total words: " + str(num_words))
+    print("# unaccented words : " + str(unaccented_words))
+    print("-----------------------------------------------")
+
+    # ambiguous word stats: map each stripped form to the set of
+    # diacritized variants observed for it
+    ambiguity_map = defaultdict(set)
+    for word in words:
+        no_accents = strip_accents_text(word)
+        ambiguity_map[no_accents].add(word)
+
+    # histogram of ambiguity levels: ambiguity_histogram[n] counts the
+    # stripped forms with exactly n diacritized variants
+    ambiguity_histogram = defaultdict(int)
+    ambiguous_words = 0
+    for word, variants in ambiguity_map.items():
+        if len(variants) > 1:
+            ambiguous_words += 1
+            ambiguity_histogram[len(variants)] += 1
+
+    # print ambiguity map
+    for word, variants in ambiguity_map.items():
+        if len(variants) > 1:
+            print("# " + str(len(variants)) + ": " + str(variants))
+
+    print("# unique ambiguous words : " + str(ambiguous_words))
+    print("# total unique non-diacritized words : " + str(len(ambiguity_map)))
+
+    unique_all_words = set(words)
+
+    print("# total unique words : " + str(len(unique_all_words)))
+    print("-----------------------------------------------")
+    for level in range(2, 10):
+        print("# ambiguous " + str(level) + " words : " + str(ambiguity_histogram[level]))
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pred.txt b/tests/pred.txt
new file mode 100644
index 0000000..fa84c75
--- /dev/null
+++ b/tests/pred.txt
@@ -0,0 +1 @@
+ṣùgbọ́n
diff --git a/tests/test_adr.py b/tests/test_adr.py
index 7dc30de..7c9c646 100644
--- a/tests/test_adr.py
+++ b/tests/test_adr.py
@@ -2,126 +2,122 @@
 import filecmp
 
 import iranlowo.adr as ránlọ
-import os
+from iranlowo import utils
+from iranlowo import preprocessing
+import unittest
+
+from tests.utils import datapath
 
 
-def test_strip_accents_text():
-    ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889"
-    yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà"
-    yo_1 = "Kí ó tó di ààrẹ"
+class IranlowoADRTest(unittest.TestCase):
+
+    def test_strip_accents_text(self):
+        ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889"
+        yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà"
+        yo_1 = "Kí ó tó di ààrẹ"
 
-    assert ránlọ.strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889"
-    assert ránlọ.strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika"
-    assert ránlọ.strip_accents_text(yo_1) == "Ki o to di aare"
+        self.assertEqual(utils.strip_accents_text(ca_fr), "Montreal, uber, 12.89, Mere, Francoise, noel, 889")
+        self.assertEqual(utils.strip_accents_text(yo_0), "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika")
+        self.assertEqual(utils.strip_accents_text(yo_1), "Ki o to di aare")
 
+    def test_strip_accents_file(self):
+        src_filepath = datapath('src_file.txt')
+        reference_stripped_filepath = datapath('ref_proccessed_file.txt')
+        processed_stripped_filepath = datapath('processed_file.txt')
 
-def test_strip_accents_file():
-    cwd = os.getcwd()
-    src_filepath = cwd + "/tests/testdata/src_file.txt"
-    reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt"
-    processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt"
+        self.assertTrue(preprocessing.strip_accents_file(src_filepath, processed_stripped_filepath))
+        self.assertFalse(filecmp.cmp(src_filepath, processed_stripped_filepath))
+        self.assertTrue(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath))
 
-    assert(ránlọ.strip_accents_file(src_filepath, processed_stripped_filepath) is True)  # job completed
-    assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False)  # src & processed are different
-    assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath))  # processed matches reference
+    def test_is_file_nfc(self):
+        src_filepath_pass = datapath('nfc.txt')
+        src_filepath_fail = datapath('nfc_fail.txt')
+        self.assertTrue(utils.is_file_nfc(src_filepath_pass))
+        self.assertFalse(utils.is_file_nfc(src_filepath_fail))
 
-def test_is_file_nfc():
-    cwd = os.getcwd()
-    src_filepath_pass = cwd + "/tests/testdata/nfc.txt"
-    src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt"
-    assert (ránlọ.is_file_nfc(src_filepath_pass) is True)
-    assert (ránlọ.is_file_nfc(src_filepath_fail) is False)
+    def test_is_text_nfc(self):
+        self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFD
+        self.assertTrue(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFC
+        # cover diacritics that have both accents and underdots
+        self.assertTrue(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è'))  # NFC
+        self.assertFalse(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è'))  # NFD
 
-def test_is_text_nfc():
-    assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False)  # NFD
-    assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True)  # NFC
-
-    # cover diacritics that have both accents and underdots
-    assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False)  # NFD
-    assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True)  # NFC
+    def test_normalize_diacritics_file(self):
+        nfd_filepath = datapath('nfd.txt')
+        reference_nfc_filepath = datapath('nfc.txt')
+        processed_nfc_filepath = datapath('processed_nfc.txt')
+        self.assertTrue(preprocessing.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath))
+        self.assertFalse(filecmp.cmp(nfd_filepath, processed_nfc_filepath))  # src & processed are different
+        self.assertTrue(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath))  # processed matches reference
 
-def test_normalize_diacritics_file():
-    cwd = os.getcwd()
-    nfd_filepath = cwd + "/tests/testdata/nfd.txt"
-    reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt"
-    processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt"
+    def test_file_info(self):
+        reference_nfc_filepath = datapath('nfc.txt')
+        utils.file_info(reference_nfc_filepath)
 
-    assert(ránlọ.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True)  # job completed
-    assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False)  # src & processed are different
-    assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True)  # processed matches reference
+    # def test_split_corpus_on_symbol(self):
+    #     multiline_filepath = datapath('multiline.txt')
+    #     reference_multiline_split_filepath = datapath('multiline.split.txt')
+    #     processed_multiline_split_filepath = datapath('processed_multiline.split.txt')
+    #
+    #     assert(preprocessing.split_corpus_on_symbol(multiline_filepath,
+    #            reference_multiline_split_filepath, ',') is True)  # job completed
+    #     assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False)  # src & processed are different
+    #     assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True)  # processed matches reference
+    #
+    #     # try different punctuation ',', ':', etc?
-def test_file_info():
-    cwd = os.getcwd()
-    reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt"
-    ránlọ.file_info(reference_nfc_filepath)
-
-    # reference_nfc_filepath
-
-# def test_split_corpus_on_symbol():
-#     cwd = os.getcwd()
-#     multiline_filepath = "/tests/testdata/multiline.txt"
-#     reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt"
-#     processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt"
-#
-#     assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath,
-#            reference_multiline_split_filepath, ',') is True)  # job completed
-#     assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False)  # src & processed are different
-#     assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True)  # processed matches reference
-#
-#     # try different punctuation ',', ':', etc?
-
-def test_diacritize_text():
-    predictions = ránlọ.diacritize_text("leyin igba naa")
-    assert(predictions == "lẹ́yìn ìgbà náà")  # generated matches reference
-    assert(predictions != "lẹ́yìn igbà náà")  # generated does not match incorrect reference
+    def test_diacritize_text(self):
+        predictions = ránlọ.diacritize_text("leyin igba naa")
+        self.assertEqual(predictions, "lẹ́yìn ìgbà náà")  # generated matches reference
+        self.assertNotEqual(predictions, "lẹ́yìn igbà náà")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("obinrin")
-    assert(predictions == "obìnrin")  # generated matches reference
-    assert(predictions != "obinrin")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("obinrin")
+        self.assertEqual(predictions, "obìnrin")  # generated matches reference
+        self.assertNotEqual(predictions, "obinrin")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("okunrin")
-    assert(predictions == "ọkùnrin")  # generated matches reference
-    assert(predictions != "ọkunrin")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("okunrin")
+        self.assertEqual(predictions, "ọkùnrin")  # generated matches reference
+        self.assertNotEqual(predictions, "ọkunrin")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("orisirisi")
-    assert(predictions == "oríṣiríṣi")  # generated matches reference
-    assert(predictions != "orísiríṣi")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("orisirisi")
+        self.assertEqual(predictions, "oríṣiríṣi")  # generated matches reference
+        self.assertNotEqual(predictions, "orísiríṣi")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("nitori naa")
-    assert(predictions == "nítorí náà")  # generated matches reference
-    assert(predictions != "nitorí náà")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("nitori naa")
+        self.assertEqual(predictions, "nítorí náà")  # generated matches reference
+        self.assertNotEqual(predictions, "nitorí náà")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo")
-    assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́")  # generated matches reference
-    assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo")
+        self.assertEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́")  # generated matches reference
+        self.assertNotEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́")  # generated does not match incorrect reference
 
-    # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode")
-    # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated matches reference
-    # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated does not match incorrect reference
+        # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode")
+        # self.assertEqual(predictions, "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated matches reference
+        # self.assertNotEqual(predictions, "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("ati beebee lo")
-    assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated matches reference
-    assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("ati beebee lo")
+        self.assertEqual(predictions, "àti bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated matches reference
+        self.assertNotEqual(predictions, "ati bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari")
-    assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí")  # generated matches reference
-    assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari")
+        self.assertEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí")  # generated matches reference
+        self.assertNotEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí")  # generated does not match incorrect reference
 
-    # predictions = ránlọ.diacritize_text("bi ase nlo yii")
-    # assert(predictions == "bí aṣe ńlọ yìí")  # generated matches reference
-    # assert(predictions != "bí ase ńlọ yìí")  # generated does not match incorrect reference
+        # predictions = ránlọ.diacritize_text("bi ase nlo yii")
+        # self.assertEqual(predictions, "bí aṣe ńlọ yìí")  # generated matches reference
+        # self.assertNotEqual(predictions, "bí ase ńlọ yìí")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("o dabi pe")
-    assert(predictions == "ó dàbí pé")  # generated matches reference
-    assert(predictions != "ó dàbí pe")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("o dabi pe")
+        self.assertEqual(predictions, "ó dàbí pé")  # generated matches reference
+        self.assertNotEqual(predictions, "ó dàbí pe")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("sugbon")
-    assert(predictions == "ṣùgbọ́n")  # generated matches reference
-    assert(predictions != "ṣugbọ́n")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("sugbon")
+        self.assertEqual(predictions, "ṣùgbọ́n")  # generated matches reference
+        self.assertNotEqual(predictions, "ṣugbọ́n")  # generated does not match incorrect reference
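The diacritizer itself is untouched by this refactor; restoring diacritics remains a single call into iranlowo.adr, with the expected outputs taken from the tests above:

    import iranlowo.adr as ránlọ

    print(ránlọ.diacritize_text("sugbon"))          # "ṣùgbọ́n"
    print(ránlọ.diacritize_text("leyin igba naa"))  # "lẹ́yìn ìgbà náà"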
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
new file mode 100644
index 0000000..4ebc36e
--- /dev/null
+++ b/tests/test_corpus.py
@@ -0,0 +1,69 @@
+import string
+import unittest
+from pathlib import Path
+
+from iranlowo import corpus
+from tests.utils import datapath
+
+
+class TestTextCorpus(unittest.TestCase):
+    def setUp(self):
+        self.corpus_class = corpus.Corpus
+        self.directory_loader = corpus.DirectoryCorpus
+        self.txt_extension = 'txt'
+        self.csv_extension = 'csv'
+        self.gzip_extension = 'gzip'
+
+    def test_load_corpus_from_path(self):
+        path = datapath('owe_pass')
+        corp = self.corpus_class(path=path, fformat=self.txt_extension)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_from_path_stream(self):
+        path = datapath('owe_pass')
+        corp = self.corpus_class(path=path, fformat=self.txt_extension, stream=True)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_from_text(self):
+        text = open(datapath('owe_pass')).read()
+        corp = self.corpus_class(text=text)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_with_preprocessing(self):
+        lines = [
+            "Àwọn obìnrin, wọn ní kiní agbára yẹn lórí àwọn ọkùnrin?",
+            "Ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?",
+            "eranko wo lo buru julo"
+        ]
+        expected = [
+            'Àwọn obìnrin wọn ní kiní agbára yẹn lórí àwọn ọkùnrin',
+            "ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?",
+            'erankowoloburujulo'
+        ]
+
+        def remove_punctuation(text):
+            return text.translate(str.maketrans('', '', string.punctuation))
+
+        preprocessing = [
+            lambda x: remove_punctuation(x), lambda x: x.lower(), lambda x: x.replace(' ', '')
+        ]
+
+        for index, entry in enumerate(lines):
+            corp = self.corpus_class(text=entry, preprocess=preprocessing[index])
+            self.assertEqual(corp.data, expected[index])
+
+    def test_load_corpus_from_directory(self):
+        direc = datapath('dirdata')
+        invalid_dir = datapath('test_data')
+        multi_dir = datapath()
+        path = Path(direc).glob('*')
+        dir_corpus = self.directory_loader(path=direc)
+        self.assertEqual(len(dir_corpus.data), len(list(path)))
+        with self.assertRaises(NotADirectoryError):
+            self.directory_loader(path=invalid_dir)
+        multi_corp = self.directory_loader(path=multi_dir)
+        multi_path = Path(multi_dir).glob('**/*')
+        self.assertEqual(len(multi_corp.data), len(list(multi_path)) - 1)
+
+    def test_save(self):
+        pass
+
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
new file mode 100644
index 0000000..7468a35
--- /dev/null
+++ b/tests/test_loaders.py
@@ -0,0 +1,12 @@
+import unittest
+
+from iranlowo import corpus
+
+
+class TestCorpusLoader(unittest.TestCase):
+    def setUp(self):
+        self.owe_loader = corpus.OweLoader
+
+    def test_load_owe(self):
+        with self.assertRaises(NotADirectoryError):
+            self.owe_loader()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 3203e18..84dc379 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,10 +1,11 @@
-import os
+import unittest
 
 from iranlowo import preprocessing
+from tests.utils import datapath
 
 
-def test_is_valid_owe_format():
-    cwd = os.getcwd()
-    fail_path = cwd + "/tests/testdata/nfc.txt"
+class IranlowoPreprocessingTest(unittest.TestCase):
 
-    assert preprocessing.is_valid_owé_format(fail_path) is False
+    def test_is_valid_owe_format(self):
+        fail_path = datapath('nfc.txt')
+        self.assertFalse(preprocessing.is_valid_owé_format(fail_path))
diff --git a/tests/testdata/dirdata/yo_000.txt b/tests/testdata/dirdata/yo_000.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/testdata/dirdata/yo_000.txt
@@ -0,0 +1 @@
+
diff --git a/tests/testdata/dirdata/yo_001.txt b/tests/testdata/dirdata/yo_001.txt
new file mode 100644
index 0000000..ab97739
--- /dev/null
+++ b/tests/testdata/dirdata/yo_001.txt
@@ -0,0 +1 @@
+A di gàárì sílẹ̀ ewúrẹ́ ńyọjú; ẹrù ìran rẹ̀ ni?
diff --git a/tests/testdata/dirdata/yo_002.txt b/tests/testdata/dirdata/yo_002.txt
new file mode 100644
index 0000000..87d7002
--- /dev/null
+++ b/tests/testdata/dirdata/yo_002.txt
@@ -0,0 +1 @@
+A fi ọ́ jọba ò ńṣàwúre o fẹ́ jẹ Ọlọ́run ni?
diff --git a/tests/testdata/dirdata/yo_003.txt b/tests/testdata/dirdata/yo_003.txt
new file mode 100644
index 0000000..822cd91
--- /dev/null
+++ b/tests/testdata/dirdata/yo_003.txt
@@ -0,0 +1 @@
+A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, bí a ò bá jà, bí a bá ti gba Awà, kò tán bí?
diff --git a/tests/testdata/dirdata/yo_004.txt b/tests/testdata/dirdata/yo_004.txt
new file mode 100644
index 0000000..c08f5e6
--- /dev/null
+++ b/tests/testdata/dirdata/yo_004.txt
@@ -0,0 +1 @@
+A gbé gàárì ọmọ ewurẹ ńrojú; kì í ṣe ẹrù àgùntàn.
diff --git a/tests/testdata/dirdata/yo_005.txt b/tests/testdata/dirdata/yo_005.txt
new file mode 100644
index 0000000..19a221e
--- /dev/null
+++ b/tests/testdata/dirdata/yo_005.txt
@@ -0,0 +1 @@
+A kì í bá ọba pàlà kí ọkọ́ ọba má ṣàn-ánni lẹ́sẹ̀.
diff --git a/tests/testdata/dirdata/yo_006.txt b/tests/testdata/dirdata/yo_006.txt
new file mode 100644
index 0000000..35113f5
--- /dev/null
+++ b/tests/testdata/dirdata/yo_006.txt
@@ -0,0 +1 @@
+A kì í bínú ààtàn ká dalẹ̀ sígbẹ̀ẹ́.
diff --git a/tests/testdata/dirdata/yo_007.txt b/tests/testdata/dirdata/yo_007.txt
new file mode 100644
index 0000000..695bae3
--- /dev/null
+++ b/tests/testdata/dirdata/yo_007.txt
@@ -0,0 +1 @@
+A kì í bínú orí ká fi fìlà dé ìbàdí.
diff --git a/tests/testdata/dirdata/yo_008.txt b/tests/testdata/dirdata/yo_008.txt
new file mode 100644
index 0000000..5123008
--- /dev/null
+++ b/tests/testdata/dirdata/yo_008.txt
@@ -0,0 +1 @@
+A kì í bẹ̀rù ikú bẹ̀rù àrùn ká ní kí ọmọ ó kú sinni.
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..8f8ed14
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,9 @@
+import os
+
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
+
+
+def datapath(fname=None):
+    if not fname:
+        return os.path.join(module_path, 'testdata')
+    return os.path.join(module_path, 'testdata', fname)
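The loaders resolve corpora relative to the YORUBA_TEXT_PATH environment variable, as exercised by test_loaders.py above. A minimal end-to-end sketch, assuming a local clone of the yoruba-text repository at a hypothetical location:

    import os

    os.environ["YORUBA_TEXT_PATH"] = "/data/yoruba-text"  # hypothetical clone path

    from iranlowo.corpus import OweLoader

    owe = OweLoader()   # wraps Owe/en and Owe/yo as DirectoryCorpus objects
    print(owe.en.path)  # list of files discovered under Owe/en
    print(owe.yo.path)  # list of files discovered under Owe/yo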