From 23e4594fc141f0c284c14d6129c14f62d799428e Mon Sep 17 00:00:00 2001 From: Olamyy Date: Fri, 12 Jul 2019 16:00:10 +0100 Subject: [PATCH 1/6] Introduced a corpus module --- src/iranlowo/adr.py | 271 -------------------------------- src/iranlowo/corpus/__init__.py | 0 src/iranlowo/corpus/corpus.py | 151 ++++++++++++++++++ src/iranlowo/preprocessing.py | 128 +++++++++++++++ src/iranlowo/utils.py | 136 ++++++++++++++++ tests/test_adr.py | 56 ++++--- tests/test_corpus.py | 0 7 files changed, 445 insertions(+), 297 deletions(-) create mode 100644 src/iranlowo/corpus/__init__.py create mode 100644 src/iranlowo/corpus/corpus.py create mode 100644 src/iranlowo/utils.py create mode 100644 tests/test_corpus.py diff --git a/src/iranlowo/adr.py b/src/iranlowo/adr.py index 1012213..e57e730 100644 --- a/src/iranlowo/adr.py +++ b/src/iranlowo/adr.py @@ -4,272 +4,11 @@ from __future__ import unicode_literals import pkg_resources -import re -import unicodedata - from argparse import Namespace -from collections import defaultdict from onmt.translate.translator import build_translator from onmt.utils.parse import ArgumentParser -def strip_accents_text(text_string): - """ - Converts the string to NFD, separates & returns only the base characters - :param text_string: - :return: input string without diacritic adornments on base characters - """ - return "".join( - c - for c in unicodedata.normalize("NFD", text_string) - if unicodedata.category(c) != "Mn" - ) - - -def strip_accents_file(filename, outfilename): - """ - Reads filename containing diacritics, converts to NFC for consistency, - then writes outfilename with diacritics removed - :param filename: - :param outfilename: - :return: None - """ - text = "".join( - c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) - ) - try: - f = open(outfilename, "w") - except EnvironmentError: - return False - else: - with f: - f.write(strip_accents_text(text)) - return True - - -def is_file_nfc(path): - """ - - Args: - path: File path - - Returns: True if file is valid nfc and False if not. 
Raises a ValueError if path is not correct - - """ - text = open(path).read() - return is_text_nfc(text) - - -def is_text_nfc(text): - """Validate unicode form of given text""" - nfc_text = "".join(c for c in unicodedata.normalize("NFC", text)) - if nfc_text == text: - return True - else: - return False - - -def normalize_diacritics_text(text_string): - """Convenience wrapper to abstract away unicode & NFC""" - return unicodedata.normalize("NFC", text_string) - - -def normalize_diacritics_file(filename, outfilename): - """File based Convenience wrapper to abstract away unicode & NFC""" - try: - text = "".join( - c - for c in unicodedata.normalize( - "NFC", open(filename, encoding="utf-8").read() - ) - ) - with open(outfilename, "w", encoding="utf-8") as f: - f.write(text) - except EnvironmentError: - return False - else: - return True - - -def file_info(filename): - """File metadata useful for various ADR tasks""" - - print("\nFilename: " + filename) - print("---------------------------------") - - lines = tuple(open(filename, "r", encoding="utf-8")) - num_utts = len(lines) - - text = "".join( - c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) - ) - words = re.findall("\w+", text) - num_words = len(words) - num_chars = len(re.findall(r"\S", text)) - - unique_chars = set(text) - num_uniq_chars = len(unique_chars) - - print(sorted(unique_chars)) - print("# utts : " + str(num_utts)) - print("# chars : " + str(num_chars)) - print("# uniq chars: " + str(num_uniq_chars)) - - # unaccented word stats - unaccented_words = 0 - for word in words: - if word == strip_accents_text(word): - unaccented_words += 1 - - print("# total words: " + str(num_words)) - print("# unaccented words : " + str(unaccented_words)) - print("-----------------------------------------------") - - # ambiguous word stats - ambiguity_map = defaultdict(set) - for word in words: - no_accents = strip_accents_text(word) - ambiguity_map[no_accents].add(word) - - ambiguous_words = 0 - ambiguous_words_2 = 0 - ambiguous_words_3 = 0 - ambiguous_words_4 = 0 - ambiguous_words_5 = 0 - ambiguous_words_6 = 0 - ambiguous_words_7 = 0 - ambiguous_words_8 = 0 - ambiguous_words_9 = 0 - - # fill ambiguity map - for word in ambiguity_map: - if len(ambiguity_map[word]) > 1: - ambiguous_words += 1 - if len(ambiguity_map[word]) == 2: - ambiguous_words_2 += 1 - elif len(ambiguity_map[word]) == 3: - ambiguous_words_3 += 1 - elif len(ambiguity_map[word]) == 4: - ambiguous_words_4 += 1 - elif len(ambiguity_map[word]) == 5: - ambiguous_words_5 += 1 - elif len(ambiguity_map[word]) == 6: - ambiguous_words_6 += 1 - elif len(ambiguity_map[word]) == 7: - ambiguous_words_7 += 1 - elif len(ambiguity_map[word]) == 8: - ambiguous_words_8 += 1 - elif len(ambiguity_map[word]) == 9: - ambiguous_words_9 += 1 - - # print ambiguity map - for word in ambiguity_map: - if len(ambiguity_map[word]) == 2: - print("# 2: " + str(ambiguity_map[word])) - if len(ambiguity_map[word]) == 3: - print("# 3: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 4: - print("# 4: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 5: - print("# 5: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 6: - print("# 6: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 7: - print("# 7: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 8: - print("# 8: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 9: - print("# 9: " + str(ambiguity_map[word])) - - print("# unique ambiguous words : 
" + str(ambiguous_words)) - print("# total unique non-diacritized words : " + str(len(ambiguity_map))) - - unique_all_words = set() - for word in words: - unique_all_words.add(word) - - print("# total unique words : " + str(len(unique_all_words))) - print("-----------------------------------------------") - print("# ambiguous 2 words : " + str(ambiguous_words_2)) - print("# ambiguous 3 words : " + str(ambiguous_words_3)) - print("# ambiguous 4 words : " + str(ambiguous_words_4)) - print("# ambiguous 5 words : " + str(ambiguous_words_5)) - print("# ambiguous 6 words : " + str(ambiguous_words_6)) - print("# ambiguous 7 words : " + str(ambiguous_words_7)) - print("# ambiguous 8 words : " + str(ambiguous_words_8)) - print("# ambiguous 9 words : " + str(ambiguous_words_9)) - - -def split_corpus_on_symbol(filename, outfilename, symbol=","): - """ - For yoruba blog (and probably bibeli mimo) - - Args: filenames for I/O and symbol to split lines on - Returns: writes outputfile - :param filename: input file - :param outfilename: processed output file to write - :param symbol: to split lines on - :return: None, with side-effect of writing an outputfile - """ - - lines = tuple(open(filename, "r", encoding="utf-8")) - - min_words_to_split = 10 - min_words_in_utt = 5 - - with open(outfilename, "w") as f: - # split out heavily comma'd text :(( - for line in lines: - if symbol in line: - num_words = len(line.split()) - num_commas = line.count(symbol) - curr_comma_position = line.index(symbol) - num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) - - curr_line = line - while num_commas > 0: - if num_words < min_words_to_split: - # print(curr_line.strip()) - f.write(curr_line) - break - if num_words >= min_words_to_split: - if ( - num_words_ahead_of_curr_comma >= min_words_in_utt - and len((curr_line)[curr_comma_position:].split()) - >= min_words_in_utt - ): - f.write((curr_line)[0:curr_comma_position] + "\n") - - # update vars - curr_line = curr_line[curr_comma_position + 1:] - num_words = len(curr_line.split()) - num_commas = num_commas - 1 - if num_commas > 0: - curr_comma_position = curr_line.index(symbol) - num_words_ahead_of_curr_comma = len( - curr_line[0:curr_comma_position].split() - ) - else: - f.write(curr_line) - else: - # ignore too short comma (+= vs = on current comma position) - num_commas = num_commas - 1 - if num_commas > 0: # for say 3 commas - curr_comma_position += ( - curr_line[curr_comma_position + 1:].index(symbol) - + 1 - ) - num_words_ahead_of_curr_comma = len( - curr_line[0:curr_comma_position].split() - ) - else: - f.write(curr_line) - else: - f.write(curr_line) - else: - f.write(line) - - def diacritize_text(undiacritized_text, verbose=False): # manually construct the options so we don't have to pass them in. 
opt = Namespace() @@ -339,13 +78,3 @@ def diacritize_text(undiacritized_text, verbose=False): ) return prediction[0][0] - -if __name__ == "__main__": - # # test - print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFD - print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFC - print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Book_of_Mormon/cleaned/doctrine_and_covenants.txt')) - - print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Owe/yoruba_proverbs_out.txt')) - - # file_info("../../tests/testdata/nfc.txt") diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py new file mode 100644 index 0000000..cb47cc5 --- /dev/null +++ b/src/iranlowo/corpus/corpus.py @@ -0,0 +1,151 @@ +import gzip +import os +import sys + +import requests + +from gensim import interfaces +from gensim.corpora.csvcorpus import CsvCorpus +from gensim.corpora.textcorpus import lower_to_unicode, strip_multiple_whitespaces, walk +from gensim.utils import deaccent + +from iranlowo.preprocessing import is_valid_owé_format +from iranlowo.utils import is_text_nfc + + +class Corpus(interfaces.CorpusABC): + def __init__(self, path=None, text=None, is_url=False, rlist=False, stream=False, fformat='txt', cformat=None, labels=False, preprocess=False): + """ + + Args: + path: + text: + **kwargs: + """ + self.path = path + self.text = text + self.rlist = rlist + self.labels = labels + self.stream = stream + self.fformat = fformat + self.preprocess = preprocess + self.cformat = cformat + self.is_url = is_url + self.data = text if text else self.read_file_or_filename() + self.validate_format() + + def __iter__(self): + for line in self.data: + yield line + + def __len__(self): + return len(self.data) + + def get_data(self): + pass + + @staticmethod + def save_corpus(fname, corpus, id2word=None, metadata=False): + pass + + def streamfile(self, fobj): + num_text = 0 + with fobj as obj: + for line in obj: + num_text += 1 + yield line + + def read_file_or_filename(self, f=None): + """ + + Returns: + + """ + path = f if f else self.path + text = None + print(len(self.path)) + out = [] + if isinstance(path, list): + for f in path: + path.remove(f) + sys.setrecursionlimit(10000) + text = self.read_file_or_filename(f) + out.append(text) + else: + if self.is_url: + r = requests.get(path) + if r.status_code in [200, 201]: + text = r.text + return text + elif isinstance(path, str): + if self.fformat == "txt": + text = open(path) + elif self.fformat == "csv": + text = CsvCorpus(path, self.labels) + elif self.fformat == 'gzip': + text = gzip.open(path) + else: + text = self.path.seek(0) + + if not self.stream: + text = text.read() if not self.rlist else text.readlines() + print(text) + if self.preprocess: + text = self.handle_preprocessing(text) + return text + else: + self.streamfile(text) + + def handle_preprocessing(self, text): + if callable(self.preprocess): + return self.preprocess(text) + if isinstance(self.preprocess, list): + prep_list = self.preprocess if isinstance(self.preprocess, list) else [lower_to_unicode, deaccent, strip_multiple_whitespaces] + for technique in prep_list: + text = technique(self.data) + return text + + def validate_format(self): + """ + + Returns: + + """ + data = self.data + if isinstance(data, list): + data = ' '.join(data) + if not self.cformat and not is_text_nfc(data): + raise 
TypeError("The corpus does not comply to the NFC corpus format") + elif self.cformat == "owe": + if not is_valid_owé_format(data): + raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat)) + else: + return True + + def generate(self, size): + """ + + Args: + size: + + Returns: + + """ + if not self.cformat: + raise ValueError("You need to specify a format for generating random text") + + +class DirectoryCorpus(Corpus): + def __init__(self, path, **kwargs): + self.path_dir = path + walked = list(walk(self.path_dir)) + self.depth = walked[0][0] + self.dirnames = walked[0][2] + self.flist = walked[0][3] + self.path = list(self.read_files()) + super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) + + def read_files(self): + for path in self.flist: + yield os.path.join(self.path_dir, path) + diff --git a/src/iranlowo/preprocessing.py b/src/iranlowo/preprocessing.py index fe72ee7..73545fe 100644 --- a/src/iranlowo/preprocessing.py +++ b/src/iranlowo/preprocessing.py @@ -1,5 +1,6 @@ import csv import gzip +import unicodedata from pathlib import Path @@ -90,3 +91,130 @@ def get_chunk(txt, n): except IndexError: pass # End of file reached + +def strip_accents_text(text_string): + """ + Converts the string to NFD, separates & returns only the base characters + :param text_string: + :return: input string without diacritic adornments on base characters + """ + return "".join( + c + for c in unicodedata.normalize("NFD", text_string) + if unicodedata.category(c) != "Mn" + ) + + +def strip_accents_file(filename, outfilename): + """ + Reads filename containing diacritics, converts to NFC for consistency, + then writes outfilename with diacritics removed + :param filename: + :param outfilename: + :return: None + """ + text = "".join( + c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) + ) + try: + f = open(outfilename, "w") + except EnvironmentError: + return False + else: + with f: + f.write(strip_accents_text(text)) + return True + + +def normalize_diacritics_text(text_string): + """Convenience wrapper to abstract away unicode & NFC""" + return unicodedata.normalize("NFC", text_string) + + +def normalize_diacritics_file(filename, outfilename): + """File based Convenience wrapper to abstract away unicode & NFC""" + try: + text = "".join( + c + for c in unicodedata.normalize( + "NFC", open(filename, encoding="utf-8").read() + ) + ) + with open(outfilename, "w", encoding="utf-8") as f: + f.write(text) + except EnvironmentError: + return False + else: + return True + + +def split_corpus_on_symbol(filename, outfilename, symbol=","): + """ + For yoruba blog (and probably bibeli mimo) + + Args: filenames for I/O and symbol to split lines on + Returns: writes outputfile + :param filename: input file + :param outfilename: processed output file to write + :param symbol: to split lines on + :return: None, with side-effect of writing an outputfile + """ + + lines = tuple(open(filename, "r", encoding="utf-8")) + + min_words_to_split = 10 + min_words_in_utt = 5 + + with open(outfilename, "w") as f: + # split out heavily comma'd text :(( + for line in lines: + if symbol in line: + num_words = len(line.split()) + num_commas = line.count(symbol) + curr_comma_position = line.index(symbol) + num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) + + curr_line = line + while num_commas > 0: + if num_words < min_words_to_split: + # print(curr_line.strip()) + f.write(curr_line) + break + if num_words >= min_words_to_split: + if ( + 
num_words_ahead_of_curr_comma >= min_words_in_utt + and len(curr_line[curr_comma_position:].split()) + >= min_words_in_utt + ): + f.write(curr_line[0:curr_comma_position] + "\n") + + # update vars + curr_line = curr_line[curr_comma_position + 1:] + num_words = len(curr_line.split()) + num_commas = num_commas - 1 + if num_commas > 0: + curr_comma_position = curr_line.index(symbol) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + f.write(curr_line) + else: + # ignore too short comma (+= vs = on current comma position) + num_commas = num_commas - 1 + if num_commas > 0: # for say 3 commas + curr_comma_position += ( + curr_line[curr_comma_position + 1:].index(symbol) + + 1 + ) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + f.write(curr_line) + else: + f.write(curr_line) + else: + f.write(line) + + diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py new file mode 100644 index 0000000..a861d32 --- /dev/null +++ b/src/iranlowo/utils.py @@ -0,0 +1,136 @@ +import re +import unicodedata +from collections import defaultdict + +from iranlowo.preprocessing import strip_accents_text + + +def is_file_nfc(path): + """ + + Args: + path: File path + + Returns: True if file is valid nfc and False if not. Raises a ValueError if path is not correct + + """ + text = open(path).read() + return is_text_nfc(text) + + +def is_text_nfc(text): + """Validate unicode form of given text""" + nfc_text = "".join(c for c in unicodedata.normalize("NFC", text)) + if nfc_text == text: + return True + else: + return False + + +def file_info(filename): + """File metadata useful for various ADR tasks""" + + print("\nFilename: " + filename) + print("---------------------------------") + + lines = tuple(open(filename, "r", encoding="utf-8")) + num_utts = len(lines) + + text = "".join( + c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) + ) + words = re.findall("\w+", text) + num_words = len(words) + num_chars = len(re.findall(r"\S", text)) + + unique_chars = set(text) + num_uniq_chars = len(unique_chars) + + print(sorted(unique_chars)) + print("# utts : " + str(num_utts)) + print("# chars : " + str(num_chars)) + print("# uniq chars: " + str(num_uniq_chars)) + + # unaccented word stats + unaccented_words = 0 + for word in words: + if word == strip_accents_text(word): + unaccented_words += 1 + + print("# total words: " + str(num_words)) + print("# unaccented words : " + str(unaccented_words)) + print("-----------------------------------------------") + + # ambiguous word stats + ambiguity_map = defaultdict(set) + for word in words: + no_accents = strip_accents_text(word) + ambiguity_map[no_accents].add(word) + + ambiguous_words = 0 + ambiguous_words_2 = 0 + ambiguous_words_3 = 0 + ambiguous_words_4 = 0 + ambiguous_words_5 = 0 + ambiguous_words_6 = 0 + ambiguous_words_7 = 0 + ambiguous_words_8 = 0 + ambiguous_words_9 = 0 + + # fill ambiguity map + for word in ambiguity_map: + if len(ambiguity_map[word]) > 1: + ambiguous_words += 1 + if len(ambiguity_map[word]) == 2: + ambiguous_words_2 += 1 + elif len(ambiguity_map[word]) == 3: + ambiguous_words_3 += 1 + elif len(ambiguity_map[word]) == 4: + ambiguous_words_4 += 1 + elif len(ambiguity_map[word]) == 5: + ambiguous_words_5 += 1 + elif len(ambiguity_map[word]) == 6: + ambiguous_words_6 += 1 + elif len(ambiguity_map[word]) == 7: + ambiguous_words_7 += 1 + elif len(ambiguity_map[word]) == 8: + ambiguous_words_8 += 1 + elif len(ambiguity_map[word]) == 9: + 
ambiguous_words_9 += 1 + + # print ambiguity map + for word in ambiguity_map: + if len(ambiguity_map[word]) == 2: + print("# 2: " + str(ambiguity_map[word])) + if len(ambiguity_map[word]) == 3: + print("# 3: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 4: + print("# 4: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 5: + print("# 5: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 6: + print("# 6: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 7: + print("# 7: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 8: + print("# 8: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 9: + print("# 9: " + str(ambiguity_map[word])) + + print("# unique ambiguous words : " + str(ambiguous_words)) + print("# total unique non-diacritized words : " + str(len(ambiguity_map))) + + unique_all_words = set() + for word in words: + unique_all_words.add(word) + + print("# total unique words : " + str(len(unique_all_words))) + print("-----------------------------------------------") + print("# ambiguous 2 words : " + str(ambiguous_words_2)) + print("# ambiguous 3 words : " + str(ambiguous_words_3)) + print("# ambiguous 4 words : " + str(ambiguous_words_4)) + print("# ambiguous 5 words : " + str(ambiguous_words_5)) + print("# ambiguous 6 words : " + str(ambiguous_words_6)) + print("# ambiguous 7 words : " + str(ambiguous_words_7)) + print("# ambiguous 8 words : " + str(ambiguous_words_8)) + print("# ambiguous 9 words : " + str(ambiguous_words_9)) diff --git a/tests/test_adr.py b/tests/test_adr.py index 7dc30de..8bd618d 100644 --- a/tests/test_adr.py +++ b/tests/test_adr.py @@ -1,18 +1,22 @@ # -*- coding: utf-8 -*- import filecmp -import iranlowo.adr as ránlọ import os +from iranlowo.adr import diacritize_text + +from iranlowo.preprocessing import strip_accents_text, strip_accents_file, normalize_diacritics_file +from iranlowo.utils import is_file_nfc, is_text_nfc, file_info + def test_strip_accents_text(): ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" yo_1 = "Kí ó tó di ààrẹ" - assert ránlọ.strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" - assert ránlọ.strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" - assert ránlọ.strip_accents_text(yo_1) == "Ki o to di aare" + assert strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" + assert strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" + assert strip_accents_text(yo_1) == "Ki o to di aare" def test_strip_accents_file(): @@ -21,7 +25,7 @@ def test_strip_accents_file(): reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt" processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt" - assert(ránlọ.strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed + assert(strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False) # src & processed are different assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) # processed matches reference @@ -30,17 +34,17 @@ def test_is_file_nfc(): cwd = os.getcwd() src_filepath_pass = cwd + "/tests/testdata/nfc.txt" src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt" - assert (ránlọ.is_file_nfc(src_filepath_pass) is True) - assert (ránlọ.is_file_nfc(src_filepath_fail) is 
False) + assert (is_file_nfc(src_filepath_pass) is True) + assert (is_file_nfc(src_filepath_fail) is False) def test_is_text_nfc(): - assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD - assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC + assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD + assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC # cover diacritics that have both accents and underdots - assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD - assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC + assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD + assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC def test_normalize_diacritics_file(): @@ -49,7 +53,7 @@ def test_normalize_diacritics_file(): reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt" - assert(ránlọ.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed + assert(normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False) # src & processed are different assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True) # processed matches reference @@ -57,7 +61,7 @@ def test_normalize_diacritics_file(): def test_file_info(): cwd = os.getcwd() reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - ránlọ.file_info(reference_nfc_filepath) + file_info(reference_nfc_filepath) # reference_nfc_filepath @@ -67,7 +71,7 @@ def test_file_info(): # reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" # processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" # -# assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath, +# assert(split_out_corpus_on_symbol(multiline_filepath, # reference_multiline_split_filepath, ',') is True) # job completed # assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different # assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference @@ -77,51 +81,51 @@ def test_file_info(): def test_diacritize_text(): - predictions = ránlọ.diacritize_text("leyin igba naa") + predictions = diacritize_text("leyin igba naa") assert(predictions == "lẹ́yìn ìgbà náà") # generated matches reference assert(predictions != "lẹ́yìn igbà náà") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("obinrin") + predictions = diacritize_text("obinrin") assert(predictions == "obìnrin") # generated matches reference assert(predictions != "obinrin") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("okunrin") + predictions = diacritize_text("okunrin") assert(predictions == "ọkùnrin") # generated matches reference assert(predictions != "ọkunrin") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("orisirisi") + predictions = diacritize_text("orisirisi") assert(predictions == "oríṣiríṣi") # generated matches reference assert(predictions != "orísiríṣi") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("nitori naa") + predictions = diacritize_text("nitori naa") assert(predictions 
== "nítorí náà") # generated matches reference assert(predictions != "nitorí náà") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo") + predictions = diacritize_text("leyin oro mi won ko tun soro mo") assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference - # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode") + # predictions = diacritize_text("awon okunrin nse ise agbara bi ise ode") # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("ati beebee lo") + predictions = diacritize_text("ati beebee lo") assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") + predictions = diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - # predictions = ránlọ.diacritize_text("bi ase nlo yii") + # predictions = diacritize_text("bi ase nlo yii") # assert(predictions == "bí aṣe ńlọ yìí") # generated matches reference # assert(predictions != "bí ase ńlọ yìí") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("o dabi pe") + predictions = diacritize_text("o dabi pe") assert(predictions == "ó dàbí pé") # generated matches reference assert(predictions != "ó dàbí pe") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("sugbon") + predictions = diacritize_text("sugbon") assert(predictions == "ṣùgbọ́n") # generated matches reference assert(predictions != "ṣugbọ́n") # generated does not match incorrect reference diff --git a/tests/test_corpus.py b/tests/test_corpus.py new file mode 100644 index 0000000..e69de29 From 741d9093d60c1cd19c36783d8d865e0ae1b51cc4 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sat, 13 Jul 2019 11:44:36 +0100 Subject: [PATCH 2/6] OOP Testing and Corpus Module --- requirements.txt | 1 + src/iranlowo/corpus/__init__.py | 1 + src/iranlowo/corpus/bbc_yoruba.py | 13 ++ src/iranlowo/corpus/bibeli.py | 12 ++ src/iranlowo/corpus/corpus.py | 55 +++------ src/iranlowo/corpus/scrapper.py | 0 src/iranlowo/corpus/yor_blog.py | 12 ++ src/iranlowo/interfaces.py | 14 +++ tests/pred.txt | 1 + tests/test_adr.py | 198 ++++++++++++++---------------- tests/test_corpus.py | 54 ++++++++ tests/test_preprocessing.py | 11 +- tests/utils.py | 7 ++ 13 files changed, 235 insertions(+), 144 deletions(-) create mode 100644 src/iranlowo/corpus/bbc_yoruba.py create mode 100644 src/iranlowo/corpus/bibeli.py create mode 100644 src/iranlowo/corpus/scrapper.py create mode 100644 src/iranlowo/corpus/yor_blog.py create mode 100644 src/iranlowo/interfaces.py create mode 100644 tests/pred.txt create mode 100644 tests/utils.py diff --git a/requirements.txt b/requirements.txt index 138f082..cdf04af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +gensim bs4 configargparse torch diff --git 
a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index e69de29..fc5a0d8 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -0,0 +1 @@ +from .corpus import Corpus, DirectoryCorpus diff --git a/src/iranlowo/corpus/bbc_yoruba.py b/src/iranlowo/corpus/bbc_yoruba.py new file mode 100644 index 0000000..004d6da --- /dev/null +++ b/src/iranlowo/corpus/bbc_yoruba.py @@ -0,0 +1,13 @@ +from iranlowo.corpus import Corpus + + +class BBCCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(BBCCorpus, self).__init__(path=self.path, **kwargs) + super().__init__(path) + diff --git a/src/iranlowo/corpus/bibeli.py b/src/iranlowo/corpus/bibeli.py new file mode 100644 index 0000000..378b029 --- /dev/null +++ b/src/iranlowo/corpus/bibeli.py @@ -0,0 +1,12 @@ +from iranlowo.corpus import Corpus + + +class BibeliCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(BibeliCorpus, self).__init__(path=self.path, **kwargs) + diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index cb47cc5..4e20a42 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -2,36 +2,33 @@ import os import sys -import requests from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus -from gensim.corpora.textcorpus import lower_to_unicode, strip_multiple_whitespaces, walk -from gensim.utils import deaccent +from gensim.corpora.textcorpus import walk -from iranlowo.preprocessing import is_valid_owé_format +from iranlowo.preprocessing import is_valid_owé_format, normalize_diacritics_text from iranlowo.utils import is_text_nfc class Corpus(interfaces.CorpusABC): - def __init__(self, path=None, text=None, is_url=False, rlist=False, stream=False, fformat='txt', cformat=None, labels=False, preprocess=False): + def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None): """ Args: path: text: - **kwargs: """ self.path = path self.text = text - self.rlist = rlist self.labels = labels self.stream = stream self.fformat = fformat - self.preprocess = preprocess self.cformat = cformat - self.is_url = is_url - self.data = text if text else self.read_file_or_filename() + self.preprocess = preprocess + if not self.preprocess: + self.preprocess = [normalize_diacritics_text] + self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() self.validate_format() def __iter__(self): @@ -41,9 +38,6 @@ def __iter__(self): def __len__(self): return len(self.data) - def get_data(self): - pass - @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): pass @@ -55,29 +49,24 @@ def streamfile(self, fobj): num_text += 1 yield line - def read_file_or_filename(self, f=None): + def read_file_filename_or_text(self, f=None, text=None): """ Returns: """ path = f if f else self.path - text = None - print(len(self.path)) out = [] - if isinstance(path, list): + if text: + return self.handle_preprocessing(text) if self.preprocess else text + elif isinstance(path, list): for f in path: path.remove(f) sys.setrecursionlimit(10000) - text = self.read_file_or_filename(f) + text = self.read_file_filename_or_text(f) out.append(text) else: - if self.is_url: - r = requests.get(path) - if r.status_code in [200, 201]: - text = r.text - return text - elif isinstance(path, str): + if isinstance(path, str): if self.fformat == "txt": text = open(path) elif self.fformat == "csv": @@ -87,22 +76,15 @@ def 
read_file_or_filename(self, f=None): else: text = self.path.seek(0) - if not self.stream: - text = text.read() if not self.rlist else text.readlines() - print(text) - if self.preprocess: - text = self.handle_preprocessing(text) - return text - else: - self.streamfile(text) + text = text.read() if not self.stream else ''.join(list(self.streamfile(text))) + return self.handle_preprocessing(text) if self.preprocess else text def handle_preprocessing(self, text): if callable(self.preprocess): return self.preprocess(text) if isinstance(self.preprocess, list): - prep_list = self.preprocess if isinstance(self.preprocess, list) else [lower_to_unicode, deaccent, strip_multiple_whitespaces] - for technique in prep_list: - text = technique(self.data) + for technique in self.preprocess: + text = technique(text) return text def validate_format(self): @@ -113,7 +95,7 @@ def validate_format(self): """ data = self.data if isinstance(data, list): - data = ' '.join(data) + data = ''.join(data) if not self.cformat and not is_text_nfc(data): raise TypeError("The corpus does not comply to the NFC corpus format") elif self.cformat == "owe": @@ -148,4 +130,3 @@ def __init__(self, path, **kwargs): def read_files(self): for path in self.flist: yield os.path.join(self.path_dir, path) - diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py new file mode 100644 index 0000000..e69de29 diff --git a/src/iranlowo/corpus/yor_blog.py b/src/iranlowo/corpus/yor_blog.py new file mode 100644 index 0000000..173b8f5 --- /dev/null +++ b/src/iranlowo/corpus/yor_blog.py @@ -0,0 +1,12 @@ +from iranlowo.corpus import Corpus + + +class YorubaBlogCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) + diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py new file mode 100644 index 0000000..5ba518e --- /dev/null +++ b/src/iranlowo/interfaces.py @@ -0,0 +1,14 @@ +import scrapy + + +class Scrapper(scrapy.Spider): + """ + Interface for scrapping data from :mod:`iranlowo.scrapper` + """ + + def __init__(self, name, urls, **kwargs): + super(Scrapper, self).__init__(name, **kwargs) + + def parse(self, response): + pass + diff --git a/tests/pred.txt b/tests/pred.txt new file mode 100644 index 0000000..fa84c75 --- /dev/null +++ b/tests/pred.txt @@ -0,0 +1 @@ +ṣùgbọ́n diff --git a/tests/test_adr.py b/tests/test_adr.py index 8bd618d..7c9c646 100644 --- a/tests/test_adr.py +++ b/tests/test_adr.py @@ -1,131 +1,123 @@ # -*- coding: utf-8 -*- import filecmp -import os +import iranlowo.adr as ránlọ +from iranlowo import utils +from iranlowo import preprocessing +import unittest + +from tests.utils import datapath -from iranlowo.adr import diacritize_text -from iranlowo.preprocessing import strip_accents_text, strip_accents_file, normalize_diacritics_file -from iranlowo.utils import is_file_nfc, is_text_nfc, file_info +class IranlowoADRTest(unittest.TestCase): + + def test_strip_accents_text(self): + ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" + yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" + yo_1 = "Kí ó tó di ààrẹ" + + self.assertEqual(utils.strip_accents_text(ca_fr), "Montreal, uber, 12.89, Mere, Francoise, noel, 889") + self.assertEqual(utils.strip_accents_text(yo_0), "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika") + self.assertEqual(utils.strip_accents_text(yo_1), "Ki o to di aare") + def test_strip_accents_file(self): + src_filepath = datapath('src_file.txt') + reference_stripped_filepath = 
datapath('ref_proccessed_file.txt') + processed_stripped_filepath = datapath('processed_file.txt') -def test_strip_accents_text(): - ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" - yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" - yo_1 = "Kí ó tó di ààrẹ" + self.assertTrue(preprocessing.strip_accents_file(src_filepath, processed_stripped_filepath)) + self.assertFalse(filecmp.cmp(src_filepath, processed_stripped_filepath)) + self.assertTrue(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) - assert strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" - assert strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" - assert strip_accents_text(yo_1) == "Ki o to di aare" + def test_is_file_nfc(self): + src_filepath_pass = datapath('nfc.txt') + src_filepath_fail = datapath('nfc_fail.txt') + self.assertTrue(utils.is_file_nfc(src_filepath_pass)) + self.assertFalse(utils.is_file_nfc(src_filepath_fail)) -def test_strip_accents_file(): - cwd = os.getcwd() - src_filepath = cwd + "/tests/testdata/src_file.txt" - reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt" - processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt" + def test_is_text_nfc(self): + self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) + self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) - assert(strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed - assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False) # src & processed are different - assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) # processed matches reference + self.assertTrue(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è')) + self.assertFalse(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è')) + def test_normalize_diacritics_file(self): + nfd_filepath = datapath('nfd.txt') + reference_nfc_filepath = datapath('nfc.txt') + processed_nfc_filepath = datapath('processed_nfc.txt') -def test_is_file_nfc(): - cwd = os.getcwd() - src_filepath_pass = cwd + "/tests/testdata/nfc.txt" - src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt" - assert (is_file_nfc(src_filepath_pass) is True) - assert (is_file_nfc(src_filepath_fail) is False) + self.assertTrue(preprocessing.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath)) + self.assertFalse(filecmp.cmp(nfd_filepath, processed_nfc_filepath)) # src & processed are different + self.assertTrue(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath)) # processed matches reference + def test_file_info(self): + reference_nfc_filepath = datapath('nfc.txt') + utils.file_info(reference_nfc_filepath) -def test_is_text_nfc(): - assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD - assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC - - # cover diacritics that have both accents and underdots - assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD - assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC + # reference_nfc_filepath + # def test_split_corpus_on_symbol(self): + # cwd = os.getcwd() + # multiline_filepath = "/tests/testdata/multiline.txt" + # reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" + # processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" + # + # 
assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath, + # reference_multiline_split_filepath, ',') is True) # job completed + # assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different + # assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference + # + # # try different punctuation ',', ':', etc? -def test_normalize_diacritics_file(): - cwd = os.getcwd() - nfd_filepath = cwd + "/tests/testdata/nfd.txt" - reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt" + def test_diacritize_text(self): + predictions = ránlọ.diacritize_text("leyin igba naa") + self.assertEqual(predictions, "lẹ́yìn ìgbà náà") # generated matches reference + self.assertNotEqual(predictions, "lẹ́yìn igbà náà") # generated does not match incorrect reference - assert(normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed - assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False) # src & processed are different - assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True) # processed matches reference + predictions = ránlọ.diacritize_text("obinrin") + self.assertEqual(predictions, "obìnrin") # generated matches reference + self.assertNotEqual(predictions, "obinrin") # generated does not match incorrect reference + predictions = ránlọ.diacritize_text("okunrin") + self.assertEqual(predictions, "ọkùnrin") # generated matches reference + self.assertNotEqual(predictions, "ọkunrin") # generated does not match incorrect reference -def test_file_info(): - cwd = os.getcwd() - reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - file_info(reference_nfc_filepath) + predictions = ránlọ.diacritize_text("orisirisi") + self.assertEqual(predictions, "oríṣiríṣi") # generated matches reference + self.assertNotEqual(predictions, "orísiríṣi") # generated does not match incorrect reference - # reference_nfc_filepath + predictions = ránlọ.diacritize_text("nitori naa") + self.assertEqual(predictions, "nítorí náà") # generated matches reference + self.assertNotEqual(predictions, "nitorí náà") # generated does not match incorrect reference -# def test_split_corpus_on_symbol(): -# cwd = os.getcwd() -# multiline_filepath = "/tests/testdata/multiline.txt" -# reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" -# processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" -# -# assert(split_out_corpus_on_symbol(multiline_filepath, -# reference_multiline_split_filepath, ',') is True) # job completed -# assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different -# assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference -# -# # try different punctuation ',', ':', etc? 
+ predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo") + self.assertEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference + self.assertNotEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference + # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode") + # assert(predictions , "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference + # assert(predictions , "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference -def test_diacritize_text(): + predictions = ránlọ.diacritize_text("ati beebee lo") + self.assertEqual(predictions, "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference + self.assertNotEqual(predictions, "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - predictions = diacritize_text("leyin igba naa") - assert(predictions == "lẹ́yìn ìgbà náà") # generated matches reference - assert(predictions != "lẹ́yìn igbà náà") # generated does not match incorrect reference + predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") + self.assertEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference + self.assertNotEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - predictions = diacritize_text("obinrin") - assert(predictions == "obìnrin") # generated matches reference - assert(predictions != "obinrin") # generated does not match incorrect reference + # predictions = ránlọ.diacritize_text("bi ase nlo yii") + # assert(predictions , "bí aṣe ńlọ yìí") # generated matches reference + # assert(predictions , "bí ase ńlọ yìí") # generated does not match incorrect reference + + predictions = ránlọ.diacritize_text("o dabi pe") + self.assertEqual(predictions, "ó dàbí pé") # generated matches reference + self.assertNotEqual(predictions, "ó dàbí pe") # generated does not match incorrect reference + + predictions = ránlọ.diacritize_text("sugbon") + self.assertEqual(predictions, "ṣùgbọ́n") # generated matches reference + self.assertNotEqual(predictions, "ṣugbọ́n") # generated does not match incorrect reference - predictions = diacritize_text("okunrin") - assert(predictions == "ọkùnrin") # generated matches reference - assert(predictions != "ọkunrin") # generated does not match incorrect reference - - predictions = diacritize_text("orisirisi") - assert(predictions == "oríṣiríṣi") # generated matches reference - assert(predictions != "orísiríṣi") # generated does not match incorrect reference - - predictions = diacritize_text("nitori naa") - assert(predictions == "nítorí náà") # generated matches reference - assert(predictions != "nitorí náà") # generated does not match incorrect reference - - predictions = diacritize_text("leyin oro mi won ko tun soro mo") - assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference - assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference - - # predictions = diacritize_text("awon okunrin nse ise agbara bi ise ode") - # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference - # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference - - predictions = diacritize_text("ati beebee lo") - assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference - 
assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - - predictions = diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") - assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference - assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - - # predictions = diacritize_text("bi ase nlo yii") - # assert(predictions == "bí aṣe ńlọ yìí") # generated matches reference - # assert(predictions != "bí ase ńlọ yìí") # generated does not match incorrect reference - - predictions = diacritize_text("o dabi pe") - assert(predictions == "ó dàbí pé") # generated matches reference - assert(predictions != "ó dàbí pe") # generated does not match incorrect reference - - predictions = diacritize_text("sugbon") - assert(predictions == "ṣùgbọ́n") # generated matches reference - assert(predictions != "ṣugbọ́n") # generated does not match incorrect reference diff --git a/tests/test_corpus.py b/tests/test_corpus.py index e69de29..73fd4db 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -0,0 +1,54 @@ +import string +import unittest + +from iranlowo import corpus +from tests.utils import datapath + + +class TestTextCorpus(unittest.TestCase): + def setUp(self): + self.corpus_class = corpus.Corpus + self.txt_extension = 'txt' + self.csv_extension = 'csv' + self.gzip_extension = 'gzip' + + def test_load_corpus_from_path(self): + path = datapath('owe_pass') + corpus = self.corpus_class(path=path, fformat=self.txt_extension) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_from_path_stream(self): + path = datapath('owe_pass') + corpus = self.corpus_class(path=path, fformat=self.txt_extension, stream=True) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_from_text(self): + text = open(datapath('owe_pass')).read() + corpus = self.corpus_class(text=text) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_with_preprocessing(self): + lines = [ + "Àwọn obìnrin, wọn ní kiní agbára yẹn lórí àwọn ọkùnrin?", + "Ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?", + "eranko wo lo buru julo" + ] + expected = [ + 'Àwọn obìnrin wọn ní kiní agbára yẹn lórí àwọn ọkùnrin', + "ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?", + 'erankowoloburujulo' + ] + + def punctuations(text): return text.translate(str.maketrans('', '', string.punctuation)) + + preprocessing = [ + lambda x: punctuations(x), lambda x: x.lower(), lambda x: x.replace(' ', '') + ] + + for index, entry in enumerate(lines): + corpus = self.corpus_class(text=entry, preprocess=preprocessing[index]) + self.assertEqual(corpus.data, expected[index]) + + def test_save(self): + pass + diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 3203e18..f172586 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,10 +1,13 @@ import os +import unittest from iranlowo import preprocessing -def test_is_valid_owe_format(): - cwd = os.getcwd() - fail_path = cwd + "/tests/testdata/nfc.txt" +class IranlowoCorpusTest(unittest.TestCase): - assert preprocessing.is_valid_owé_format(fail_path) is False + def test_is_valid_owe_format(self): + cwd = os.getcwd() + fail_path = cwd + "/tests/testdata/nfc.txt" + + assert preprocessing.is_valid_owé_format(fail_path) is False diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..a152c24 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,7 @@ +import os + +module_path = 
os.path.dirname(__file__) # needed because sample data files are located in the same folder + + +def datapath(fname): + return os.path.join(module_path, 'testdata', fname) From bff1c65765b67a6afd4ee983800734911fec8f42 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sat, 13 Jul 2019 11:47:12 +0100 Subject: [PATCH 3/6] Added preprocessing test --- tests/test_preprocessing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index f172586..eb10b1b 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -2,12 +2,11 @@ import unittest from iranlowo import preprocessing +from tests.utils import datapath class IranlowoCorpusTest(unittest.TestCase): def test_is_valid_owe_format(self): - cwd = os.getcwd() - fail_path = cwd + "/tests/testdata/nfc.txt" - - assert preprocessing.is_valid_owé_format(fail_path) is False + fail_path = datapath('nfc.txt') + self.assertFalse(preprocessing.is_valid_owé_format(fail_path)) From a07cb289f3961c73e31a51c410adc3fe2f7a578f Mon Sep 17 00:00:00 2001 From: Olamyy Date: Tue, 16 Jul 2019 00:05:37 +0100 Subject: [PATCH 4/6] Introducing Corpus Module --- src/iranlowo/corpus/__init__.py | 1 + src/iranlowo/corpus/bbc_yoruba.py | 13 ------ src/iranlowo/corpus/bibeli.py | 12 ------ src/iranlowo/corpus/corpus.py | 22 +++++------ src/iranlowo/corpus/loaders.py | 66 +++++++++++++++++++++++++++++++ src/iranlowo/corpus/yor_blog.py | 12 ------ src/iranlowo/utils.py | 6 +++ tests/test_corpus.py | 15 +++++++ tests/test_loaders.py | 13 ++++++ tests/test_preprocessing.py | 1 - tests/testdata/dirdata/yo_000.txt | 1 + tests/testdata/dirdata/yo_001.txt | 1 + tests/testdata/dirdata/yo_002.txt | 1 + tests/testdata/dirdata/yo_003.txt | 1 + tests/testdata/dirdata/yo_004.txt | 1 + tests/testdata/dirdata/yo_005.txt | 1 + tests/testdata/dirdata/yo_006.txt | 1 + tests/testdata/dirdata/yo_007.txt | 1 + tests/testdata/dirdata/yo_008.txt | 1 + tests/utils.py | 4 +- 20 files changed, 124 insertions(+), 50 deletions(-) delete mode 100644 src/iranlowo/corpus/bbc_yoruba.py delete mode 100644 src/iranlowo/corpus/bibeli.py create mode 100644 src/iranlowo/corpus/loaders.py delete mode 100644 src/iranlowo/corpus/yor_blog.py create mode 100644 tests/test_loaders.py create mode 100644 tests/testdata/dirdata/yo_000.txt create mode 100644 tests/testdata/dirdata/yo_001.txt create mode 100644 tests/testdata/dirdata/yo_002.txt create mode 100644 tests/testdata/dirdata/yo_003.txt create mode 100644 tests/testdata/dirdata/yo_004.txt create mode 100644 tests/testdata/dirdata/yo_005.txt create mode 100644 tests/testdata/dirdata/yo_006.txt create mode 100644 tests/testdata/dirdata/yo_007.txt create mode 100644 tests/testdata/dirdata/yo_008.txt diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index fc5a0d8..4201d4e 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -1 +1,2 @@ from .corpus import Corpus, DirectoryCorpus +from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus \ No newline at end of file diff --git a/src/iranlowo/corpus/bbc_yoruba.py b/src/iranlowo/corpus/bbc_yoruba.py deleted file mode 100644 index 004d6da..0000000 --- a/src/iranlowo/corpus/bbc_yoruba.py +++ /dev/null @@ -1,13 +0,0 @@ -from iranlowo.corpus import Corpus - - -class BBCCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BBCCorpus, self).__init__(path=self.path, **kwargs) - super().__init__(path) - diff --git 
a/src/iranlowo/corpus/bibeli.py b/src/iranlowo/corpus/bibeli.py deleted file mode 100644 index 378b029..0000000 --- a/src/iranlowo/corpus/bibeli.py +++ /dev/null @@ -1,12 +0,0 @@ -from iranlowo.corpus import Corpus - - -class BibeliCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BibeliCorpus, self).__init__(path=self.path, **kwargs) - diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 4e20a42..81fa2ff 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -1,7 +1,5 @@ import gzip import os -import sys - from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus @@ -26,6 +24,7 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No self.fformat = fformat self.cformat = cformat self.preprocess = preprocess + assert self.path or self.text, "You should pass either a path or text to read data from." if not self.preprocess: self.preprocess = [normalize_diacritics_text] self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() @@ -61,10 +60,9 @@ def read_file_filename_or_text(self, f=None, text=None): return self.handle_preprocessing(text) if self.preprocess else text elif isinstance(path, list): for f in path: - path.remove(f) - sys.setrecursionlimit(10000) text = self.read_file_filename_or_text(f) out.append(text) + return out else: if isinstance(path, str): if self.fformat == "txt": @@ -119,14 +117,16 @@ def generate(self, size): class DirectoryCorpus(Corpus): def __init__(self, path, **kwargs): - self.path_dir = path - walked = list(walk(self.path_dir)) - self.depth = walked[0][0] - self.dirnames = walked[0][2] - self.flist = walked[0][3] + self.dir_path = path + self.depth = kwargs.get('min_depth', 0) self.path = list(self.read_files()) super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) def read_files(self): - for path in self.flist: - yield os.path.join(self.path_dir, path) + walked = list(walk(self.dir_path)) + if not walked: + raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path)) + for depth, dirpath, _, filenames in walked: + if self.depth <= depth: + for path in filenames: + yield os.path.join(dirpath, path) diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py new file mode 100644 index 0000000..1314af6 --- /dev/null +++ b/src/iranlowo/corpus/loaders.py @@ -0,0 +1,66 @@ +import os + +from iranlowo.corpus import Corpus, DirectoryCorpus + + +class BaseLoader(object): + def __init__(self, corpus_path): + self.corpus_path = corpus_path + yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None) + if not yoruba_text_path: + raise NotADirectoryError( + "YORUBA_TEXT_PATH environment variable not found. 
Please clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set YORUBA_TEXT_PATH to its " "path") + else: + corpus_path = "{}/{}".format(yoruba_text_path, corpus_path) + self.path = corpus_path + + +class YorubaBlogCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(YorubaBlogCorpus, self).__init__(path=path, **kwargs) + + +class BBCCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(BBCCorpus, self).__init__(path=path, **kwargs) + + +class BibeliCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(BibeliCorpus, self).__init__(path=path, **kwargs) + + +class en(BaseLoader, DirectoryCorpus): + def __init__(self): + BaseLoader.__init__(self, corpus_path="Owe/en") + DirectoryCorpus.__init__(self, path=self.path) + + +class yo(BaseLoader, DirectoryCorpus): + def __init__(self): + BaseLoader.__init__(self, corpus_path="Owe/yo") + DirectoryCorpus.__init__(self, path=self.path) + + +class OweLoader(object): + def __init__(self): + self.en = en() + self.yo = yo() + diff --git a/src/iranlowo/corpus/yor_blog.py b/src/iranlowo/corpus/yor_blog.py deleted file mode 100644 index 173b8f5..0000000 --- a/src/iranlowo/corpus/yor_blog.py +++ /dev/null @@ -1,12 +0,0 @@ -from iranlowo.corpus import Corpus - - -class YorubaBlogCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) - diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py index a861d32..f4ab2c5 100644 --- a/src/iranlowo/utils.py +++ b/src/iranlowo/utils.py @@ -2,6 +2,8 @@ import unicodedata from collections import defaultdict +from pathlib import Path + from iranlowo.preprocessing import strip_accents_text @@ -27,6 +29,10 @@ def is_text_nfc(text): return False +def string_to_path(string): + return Path(string) + + def file_info(filename): """File metadata useful for various ADR tasks""" diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 73fd4db..4ebc36e 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -1,5 +1,6 @@ import string import unittest +from pathlib import Path from iranlowo import corpus from tests.utils import datapath @@ -8,6 +9,7 @@ class TestTextCorpus(unittest.TestCase): def setUp(self): self.corpus_class = corpus.Corpus + self.directory_loader = corpus.DirectoryCorpus self.txt_extension = 'txt' self.csv_extension = 'csv' self.gzip_extension = 'gzip' @@ -49,6 +51,19 @@ def punctuations(text): return text.translate(str.maketrans('', '', string.punct corpus = self.corpus_class(text=entry, preprocess=preprocessing[index]) self.assertEqual(corpus.data, expected[index]) + def test_load_corpus_from_directory(self): + direc = datapath('dirdata') + invalid_dir = datapath('test_data') + multi_dir = datapath() + path = Path(direc).glob('*') + dir_corpus = self.directory_loader(path=direc) + self.assertEqual(len(dir_corpus.data), len(list(path))) + with self.assertRaises(NotADirectoryError): + self.directory_loader(path=invalid_dir) + multi_corp = self.directory_loader(path=multi_dir) + multi_path = Path(multi_dir).glob('**/*') + self.assertEqual(len(multi_corp.data), len(list(multi_path))-1) + def test_save(self): pass diff --git a/tests/test_loaders.py b/tests/test_loaders.py new file mode 100644 index 0000000..beffa3e --- /dev/null +++ b/tests/test_loaders.py @@ -0,0 +1,13 @@ +import unittest +import os + +from iranlowo import corpus + + +class
+    def setUp(self):
+        self.owe_loader = corpus.OweLoader
+
+    def test_load_owe(self):
+        with self.assertRaises(NotADirectoryError):
+            self.owe_loader()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index eb10b1b..84dc379 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,4 +1,3 @@
-import os
 import unittest
 
 from iranlowo import preprocessing
diff --git a/tests/testdata/dirdata/yo_000.txt b/tests/testdata/dirdata/yo_000.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/testdata/dirdata/yo_000.txt
@@ -0,0 +1 @@
+
diff --git a/tests/testdata/dirdata/yo_001.txt b/tests/testdata/dirdata/yo_001.txt
new file mode 100644
index 0000000..ab97739
--- /dev/null
+++ b/tests/testdata/dirdata/yo_001.txt
@@ -0,0 +1 @@
+A di gàárì sílẹ̀ ewúrẹ́ ńyọjú; ẹrù ìran rẹ̀ ni?
diff --git a/tests/testdata/dirdata/yo_002.txt b/tests/testdata/dirdata/yo_002.txt
new file mode 100644
index 0000000..87d7002
--- /dev/null
+++ b/tests/testdata/dirdata/yo_002.txt
@@ -0,0 +1 @@
+A fi ọ́ jọba ò ńṣàwúre o fẹ́ jẹ Ọlọ́run ni?
diff --git a/tests/testdata/dirdata/yo_003.txt b/tests/testdata/dirdata/yo_003.txt
new file mode 100644
index 0000000..822cd91
--- /dev/null
+++ b/tests/testdata/dirdata/yo_003.txt
@@ -0,0 +1 @@
+A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, bí a ò bá jà, bí a bá ti gba Awà, kò tán bí?
diff --git a/tests/testdata/dirdata/yo_004.txt b/tests/testdata/dirdata/yo_004.txt
new file mode 100644
index 0000000..c08f5e6
--- /dev/null
+++ b/tests/testdata/dirdata/yo_004.txt
@@ -0,0 +1 @@
+A gbé gàárì ọmọ ewurẹ ńrojú; kì í ṣe ẹrù àgùntàn.
diff --git a/tests/testdata/dirdata/yo_005.txt b/tests/testdata/dirdata/yo_005.txt
new file mode 100644
index 0000000..19a221e
--- /dev/null
+++ b/tests/testdata/dirdata/yo_005.txt
@@ -0,0 +1 @@
+A kì í bá ọba pàlà kí ọkọ́ ọba má ṣàn-ánni lẹ́sẹ̀.
diff --git a/tests/testdata/dirdata/yo_006.txt b/tests/testdata/dirdata/yo_006.txt
new file mode 100644
index 0000000..35113f5
--- /dev/null
+++ b/tests/testdata/dirdata/yo_006.txt
@@ -0,0 +1 @@
+A kì í bínú ààtàn ká dalẹ̀ sígbẹ̀ẹ́.
diff --git a/tests/testdata/dirdata/yo_007.txt b/tests/testdata/dirdata/yo_007.txt
new file mode 100644
index 0000000..695bae3
--- /dev/null
+++ b/tests/testdata/dirdata/yo_007.txt
@@ -0,0 +1 @@
+A kì í bínú orí ká fi fìlà dé ìbàdí.
diff --git a/tests/testdata/dirdata/yo_008.txt b/tests/testdata/dirdata/yo_008.txt
new file mode 100644
index 0000000..5123008
--- /dev/null
+++ b/tests/testdata/dirdata/yo_008.txt
@@ -0,0 +1 @@
+A kì í bẹ̀rù ikú bẹ̀rù àrùn ká ní kí ọmọ ó kú sinni.
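
[Note: the dirdata fixtures above give DirectoryCorpus something concrete to walk. A minimal sketch of the behaviour that test_load_corpus_from_directory asserts -- illustrative only, assuming the package is importable and the working directory is the repository root:]

    # Not part of the patch: a sketch of what the new fixtures exercise.
    from pathlib import Path

    from iranlowo.corpus import DirectoryCorpus

    fixture_dir = "tests/testdata/dirdata"  # the yo_00*.txt files added above

    # DirectoryCorpus walks the directory and reads one entry per file,
    # so its data should line up with a plain glob of the same folder.
    corpus = DirectoryCorpus(path=fixture_dir)
    assert len(corpus.data) == len(list(Path(fixture_dir).glob("*")))
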
diff --git a/tests/utils.py b/tests/utils.py
index a152c24..8f8ed14 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -3,5 +3,7 @@
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
 
 
-def datapath(fname):
+def datapath(fname=None):
+    if not fname:
+        return os.path.join(module_path, 'testdata')
     return os.path.join(module_path, 'testdata', fname)

From 7d891c49d2da323bd52706747beccdef4a7f9ab3 Mon Sep 17 00:00:00 2001
From: Olamyy
Date: Tue, 16 Jul 2019 09:44:29 +0100
Subject: [PATCH 5/6] Fixing text

---
 requirements.txt      | 3 ++-
 tests/test_loaders.py | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cdf04af..5441c7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
-gensim
 bs4
 configargparse
 torch
 numpy
 requests
 tqdm
+google-compute-engine
+gensim
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
index beffa3e..7468a35 100644
--- a/tests/test_loaders.py
+++ b/tests/test_loaders.py
@@ -1,5 +1,4 @@
 import unittest
-import os
 
 from iranlowo import corpus
 

From f56b6f06e682a19e4c0e55a6a090d11e16f6f02c Mon Sep 17 00:00:00 2001
From: Olamyy
Date: Sun, 21 Jul 2019 16:33:29 +0100
Subject: [PATCH 6/6] Checking if __init__ solves the failing tests.

---
 tests/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
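
[Note: taken together, the series leaves the corpus package with a small loader layer on top of Corpus and DirectoryCorpus. A rough usage sketch -- assuming a local clone of https://github.com/Niger-Volta-LTI/yoruba-text; the clone path below is illustrative:]

    import os

    # BaseLoader resolves corpus paths against this environment variable and
    # raises NotADirectoryError when it is unset -- the exact case that
    # tests/test_loaders.py covers.
    os.environ["YORUBA_TEXT_PATH"] = "/path/to/yoruba-text"  # illustrative

    from iranlowo.corpus.loaders import OweLoader

    owe = OweLoader()       # wires up the Owe/en and Owe/yo directory corpora
    print(owe.yo.data[:2])  # first two Yoruba proverbs
    print(owe.en.data[:2])  # their English renderings
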