From 23e4594fc141f0c284c14d6129c14f62d799428e Mon Sep 17 00:00:00 2001 From: Olamyy Date: Fri, 12 Jul 2019 16:00:10 +0100 Subject: [PATCH 1/6] Introduced a corpus module --- src/iranlowo/adr.py | 271 -------------------------------- src/iranlowo/corpus/__init__.py | 0 src/iranlowo/corpus/corpus.py | 151 ++++++++++++++++++ src/iranlowo/preprocessing.py | 128 +++++++++++++++ src/iranlowo/utils.py | 136 ++++++++++++++++ tests/test_adr.py | 56 ++++--- tests/test_corpus.py | 0 7 files changed, 445 insertions(+), 297 deletions(-) create mode 100644 src/iranlowo/corpus/__init__.py create mode 100644 src/iranlowo/corpus/corpus.py create mode 100644 src/iranlowo/utils.py create mode 100644 tests/test_corpus.py diff --git a/src/iranlowo/adr.py b/src/iranlowo/adr.py index 1012213..e57e730 100644 --- a/src/iranlowo/adr.py +++ b/src/iranlowo/adr.py @@ -4,272 +4,11 @@ from __future__ import unicode_literals import pkg_resources -import re -import unicodedata - from argparse import Namespace -from collections import defaultdict from onmt.translate.translator import build_translator from onmt.utils.parse import ArgumentParser -def strip_accents_text(text_string): - """ - Converts the string to NFD, separates & returns only the base characters - :param text_string: - :return: input string without diacritic adornments on base characters - """ - return "".join( - c - for c in unicodedata.normalize("NFD", text_string) - if unicodedata.category(c) != "Mn" - ) - - -def strip_accents_file(filename, outfilename): - """ - Reads filename containing diacritics, converts to NFC for consistency, - then writes outfilename with diacritics removed - :param filename: - :param outfilename: - :return: None - """ - text = "".join( - c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) - ) - try: - f = open(outfilename, "w") - except EnvironmentError: - return False - else: - with f: - f.write(strip_accents_text(text)) - return True - - -def is_file_nfc(path): - """ - - Args: - path: File path - - Returns: True if file is valid nfc and False if not. 
Raises a ValueError if path is not correct - - """ - text = open(path).read() - return is_text_nfc(text) - - -def is_text_nfc(text): - """Validate unicode form of given text""" - nfc_text = "".join(c for c in unicodedata.normalize("NFC", text)) - if nfc_text == text: - return True - else: - return False - - -def normalize_diacritics_text(text_string): - """Convenience wrapper to abstract away unicode & NFC""" - return unicodedata.normalize("NFC", text_string) - - -def normalize_diacritics_file(filename, outfilename): - """File based Convenience wrapper to abstract away unicode & NFC""" - try: - text = "".join( - c - for c in unicodedata.normalize( - "NFC", open(filename, encoding="utf-8").read() - ) - ) - with open(outfilename, "w", encoding="utf-8") as f: - f.write(text) - except EnvironmentError: - return False - else: - return True - - -def file_info(filename): - """File metadata useful for various ADR tasks""" - - print("\nFilename: " + filename) - print("---------------------------------") - - lines = tuple(open(filename, "r", encoding="utf-8")) - num_utts = len(lines) - - text = "".join( - c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) - ) - words = re.findall("\w+", text) - num_words = len(words) - num_chars = len(re.findall(r"\S", text)) - - unique_chars = set(text) - num_uniq_chars = len(unique_chars) - - print(sorted(unique_chars)) - print("# utts : " + str(num_utts)) - print("# chars : " + str(num_chars)) - print("# uniq chars: " + str(num_uniq_chars)) - - # unaccented word stats - unaccented_words = 0 - for word in words: - if word == strip_accents_text(word): - unaccented_words += 1 - - print("# total words: " + str(num_words)) - print("# unaccented words : " + str(unaccented_words)) - print("-----------------------------------------------") - - # ambiguous word stats - ambiguity_map = defaultdict(set) - for word in words: - no_accents = strip_accents_text(word) - ambiguity_map[no_accents].add(word) - - ambiguous_words = 0 - ambiguous_words_2 = 0 - ambiguous_words_3 = 0 - ambiguous_words_4 = 0 - ambiguous_words_5 = 0 - ambiguous_words_6 = 0 - ambiguous_words_7 = 0 - ambiguous_words_8 = 0 - ambiguous_words_9 = 0 - - # fill ambiguity map - for word in ambiguity_map: - if len(ambiguity_map[word]) > 1: - ambiguous_words += 1 - if len(ambiguity_map[word]) == 2: - ambiguous_words_2 += 1 - elif len(ambiguity_map[word]) == 3: - ambiguous_words_3 += 1 - elif len(ambiguity_map[word]) == 4: - ambiguous_words_4 += 1 - elif len(ambiguity_map[word]) == 5: - ambiguous_words_5 += 1 - elif len(ambiguity_map[word]) == 6: - ambiguous_words_6 += 1 - elif len(ambiguity_map[word]) == 7: - ambiguous_words_7 += 1 - elif len(ambiguity_map[word]) == 8: - ambiguous_words_8 += 1 - elif len(ambiguity_map[word]) == 9: - ambiguous_words_9 += 1 - - # print ambiguity map - for word in ambiguity_map: - if len(ambiguity_map[word]) == 2: - print("# 2: " + str(ambiguity_map[word])) - if len(ambiguity_map[word]) == 3: - print("# 3: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 4: - print("# 4: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 5: - print("# 5: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 6: - print("# 6: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 7: - print("# 7: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 8: - print("# 8: " + str(ambiguity_map[word])) - elif len(ambiguity_map[word]) == 9: - print("# 9: " + str(ambiguity_map[word])) - - print("# unique ambiguous words : 
" + str(ambiguous_words)) - print("# total unique non-diacritized words : " + str(len(ambiguity_map))) - - unique_all_words = set() - for word in words: - unique_all_words.add(word) - - print("# total unique words : " + str(len(unique_all_words))) - print("-----------------------------------------------") - print("# ambiguous 2 words : " + str(ambiguous_words_2)) - print("# ambiguous 3 words : " + str(ambiguous_words_3)) - print("# ambiguous 4 words : " + str(ambiguous_words_4)) - print("# ambiguous 5 words : " + str(ambiguous_words_5)) - print("# ambiguous 6 words : " + str(ambiguous_words_6)) - print("# ambiguous 7 words : " + str(ambiguous_words_7)) - print("# ambiguous 8 words : " + str(ambiguous_words_8)) - print("# ambiguous 9 words : " + str(ambiguous_words_9)) - - -def split_corpus_on_symbol(filename, outfilename, symbol=","): - """ - For yoruba blog (and probably bibeli mimo) - - Args: filenames for I/O and symbol to split lines on - Returns: writes outputfile - :param filename: input file - :param outfilename: processed output file to write - :param symbol: to split lines on - :return: None, with side-effect of writing an outputfile - """ - - lines = tuple(open(filename, "r", encoding="utf-8")) - - min_words_to_split = 10 - min_words_in_utt = 5 - - with open(outfilename, "w") as f: - # split out heavily comma'd text :(( - for line in lines: - if symbol in line: - num_words = len(line.split()) - num_commas = line.count(symbol) - curr_comma_position = line.index(symbol) - num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) - - curr_line = line - while num_commas > 0: - if num_words < min_words_to_split: - # print(curr_line.strip()) - f.write(curr_line) - break - if num_words >= min_words_to_split: - if ( - num_words_ahead_of_curr_comma >= min_words_in_utt - and len((curr_line)[curr_comma_position:].split()) - >= min_words_in_utt - ): - f.write((curr_line)[0:curr_comma_position] + "\n") - - # update vars - curr_line = curr_line[curr_comma_position + 1:] - num_words = len(curr_line.split()) - num_commas = num_commas - 1 - if num_commas > 0: - curr_comma_position = curr_line.index(symbol) - num_words_ahead_of_curr_comma = len( - curr_line[0:curr_comma_position].split() - ) - else: - f.write(curr_line) - else: - # ignore too short comma (+= vs = on current comma position) - num_commas = num_commas - 1 - if num_commas > 0: # for say 3 commas - curr_comma_position += ( - curr_line[curr_comma_position + 1:].index(symbol) - + 1 - ) - num_words_ahead_of_curr_comma = len( - curr_line[0:curr_comma_position].split() - ) - else: - f.write(curr_line) - else: - f.write(curr_line) - else: - f.write(line) - - def diacritize_text(undiacritized_text, verbose=False): # manually construct the options so we don't have to pass them in. 
opt = Namespace() @@ -339,13 +78,3 @@ def diacritize_text(undiacritized_text, verbose=False): ) return prediction[0][0] - -if __name__ == "__main__": - # # test - print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFD - print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFC - print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Book_of_Mormon/cleaned/doctrine_and_covenants.txt')) - - print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Owe/yoruba_proverbs_out.txt')) - - # file_info("../../tests/testdata/nfc.txt") diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py new file mode 100644 index 0000000..cb47cc5 --- /dev/null +++ b/src/iranlowo/corpus/corpus.py @@ -0,0 +1,151 @@ +import gzip +import os +import sys + +import requests + +from gensim import interfaces +from gensim.corpora.csvcorpus import CsvCorpus +from gensim.corpora.textcorpus import lower_to_unicode, strip_multiple_whitespaces, walk +from gensim.utils import deaccent + +from iranlowo.preprocessing import is_valid_owé_format +from iranlowo.utils import is_text_nfc + + +class Corpus(interfaces.CorpusABC): + def __init__(self, path=None, text=None, is_url=False, rlist=False, stream=False, fformat='txt', cformat=None, labels=False, preprocess=False): + """ + + Args: + path: + text: + **kwargs: + """ + self.path = path + self.text = text + self.rlist = rlist + self.labels = labels + self.stream = stream + self.fformat = fformat + self.preprocess = preprocess + self.cformat = cformat + self.is_url = is_url + self.data = text if text else self.read_file_or_filename() + self.validate_format() + + def __iter__(self): + for line in self.data: + yield line + + def __len__(self): + return len(self.data) + + def get_data(self): + pass + + @staticmethod + def save_corpus(fname, corpus, id2word=None, metadata=False): + pass + + def streamfile(self, fobj): + num_text = 0 + with fobj as obj: + for line in obj: + num_text += 1 + yield line + + def read_file_or_filename(self, f=None): + """ + + Returns: + + """ + path = f if f else self.path + text = None + print(len(self.path)) + out = [] + if isinstance(path, list): + for f in path: + path.remove(f) + sys.setrecursionlimit(10000) + text = self.read_file_or_filename(f) + out.append(text) + else: + if self.is_url: + r = requests.get(path) + if r.status_code in [200, 201]: + text = r.text + return text + elif isinstance(path, str): + if self.fformat == "txt": + text = open(path) + elif self.fformat == "csv": + text = CsvCorpus(path, self.labels) + elif self.fformat == 'gzip': + text = gzip.open(path) + else: + text = self.path.seek(0) + + if not self.stream: + text = text.read() if not self.rlist else text.readlines() + print(text) + if self.preprocess: + text = self.handle_preprocessing(text) + return text + else: + self.streamfile(text) + + def handle_preprocessing(self, text): + if callable(self.preprocess): + return self.preprocess(text) + if isinstance(self.preprocess, list): + prep_list = self.preprocess if isinstance(self.preprocess, list) else [lower_to_unicode, deaccent, strip_multiple_whitespaces] + for technique in prep_list: + text = technique(self.data) + return text + + def validate_format(self): + """ + + Returns: + + """ + data = self.data + if isinstance(data, list): + data = ' '.join(data) + if not self.cformat and not is_text_nfc(data): + raise 
TypeError("The corpus does not comply to the NFC corpus format") + elif self.cformat == "owe": + if not is_valid_owé_format(data): + raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat)) + else: + return True + + def generate(self, size): + """ + + Args: + size: + + Returns: + + """ + if not self.cformat: + raise ValueError("You need to specify a format for generating random text") + + +class DirectoryCorpus(Corpus): + def __init__(self, path, **kwargs): + self.path_dir = path + walked = list(walk(self.path_dir)) + self.depth = walked[0][0] + self.dirnames = walked[0][2] + self.flist = walked[0][3] + self.path = list(self.read_files()) + super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) + + def read_files(self): + for path in self.flist: + yield os.path.join(self.path_dir, path) + diff --git a/src/iranlowo/preprocessing.py b/src/iranlowo/preprocessing.py index fe72ee7..73545fe 100644 --- a/src/iranlowo/preprocessing.py +++ b/src/iranlowo/preprocessing.py @@ -1,5 +1,6 @@ import csv import gzip +import unicodedata from pathlib import Path @@ -90,3 +91,130 @@ def get_chunk(txt, n): except IndexError: pass # End of file reached + +def strip_accents_text(text_string): + """ + Converts the string to NFD, separates & returns only the base characters + :param text_string: + :return: input string without diacritic adornments on base characters + """ + return "".join( + c + for c in unicodedata.normalize("NFD", text_string) + if unicodedata.category(c) != "Mn" + ) + + +def strip_accents_file(filename, outfilename): + """ + Reads filename containing diacritics, converts to NFC for consistency, + then writes outfilename with diacritics removed + :param filename: + :param outfilename: + :return: None + """ + text = "".join( + c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) + ) + try: + f = open(outfilename, "w") + except EnvironmentError: + return False + else: + with f: + f.write(strip_accents_text(text)) + return True + + +def normalize_diacritics_text(text_string): + """Convenience wrapper to abstract away unicode & NFC""" + return unicodedata.normalize("NFC", text_string) + + +def normalize_diacritics_file(filename, outfilename): + """File based Convenience wrapper to abstract away unicode & NFC""" + try: + text = "".join( + c + for c in unicodedata.normalize( + "NFC", open(filename, encoding="utf-8").read() + ) + ) + with open(outfilename, "w", encoding="utf-8") as f: + f.write(text) + except EnvironmentError: + return False + else: + return True + + +def split_corpus_on_symbol(filename, outfilename, symbol=","): + """ + For yoruba blog (and probably bibeli mimo) + + Args: filenames for I/O and symbol to split lines on + Returns: writes outputfile + :param filename: input file + :param outfilename: processed output file to write + :param symbol: to split lines on + :return: None, with side-effect of writing an outputfile + """ + + lines = tuple(open(filename, "r", encoding="utf-8")) + + min_words_to_split = 10 + min_words_in_utt = 5 + + with open(outfilename, "w") as f: + # split out heavily comma'd text :(( + for line in lines: + if symbol in line: + num_words = len(line.split()) + num_commas = line.count(symbol) + curr_comma_position = line.index(symbol) + num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) + + curr_line = line + while num_commas > 0: + if num_words < min_words_to_split: + # print(curr_line.strip()) + f.write(curr_line) + break + if num_words >= min_words_to_split: + if ( + 
num_words_ahead_of_curr_comma >= min_words_in_utt + and len(curr_line[curr_comma_position:].split()) + >= min_words_in_utt + ): + f.write(curr_line[0:curr_comma_position] + "\n") + + # update vars + curr_line = curr_line[curr_comma_position + 1:] + num_words = len(curr_line.split()) + num_commas = num_commas - 1 + if num_commas > 0: + curr_comma_position = curr_line.index(symbol) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + f.write(curr_line) + else: + # ignore too short comma (+= vs = on current comma position) + num_commas = num_commas - 1 + if num_commas > 0: # for say 3 commas + curr_comma_position += ( + curr_line[curr_comma_position + 1:].index(symbol) + + 1 + ) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + f.write(curr_line) + else: + f.write(curr_line) + else: + f.write(line) + + diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py new file mode 100644 index 0000000..a861d32 --- /dev/null +++ b/src/iranlowo/utils.py @@ -0,0 +1,136 @@ +import re +import unicodedata +from collections import defaultdict + +from iranlowo.preprocessing import strip_accents_text + + +def is_file_nfc(path): + """ + + Args: + path: File path + + Returns: True if file is valid nfc and False if not. Raises a ValueError if path is not correct + + """ + text = open(path).read() + return is_text_nfc(text) + + +def is_text_nfc(text): + """Validate unicode form of given text""" + nfc_text = "".join(c for c in unicodedata.normalize("NFC", text)) + if nfc_text == text: + return True + else: + return False + + +def file_info(filename): + """File metadata useful for various ADR tasks""" + + print("\nFilename: " + filename) + print("---------------------------------") + + lines = tuple(open(filename, "r", encoding="utf-8")) + num_utts = len(lines) + + text = "".join( + c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read()) + ) + words = re.findall("\w+", text) + num_words = len(words) + num_chars = len(re.findall(r"\S", text)) + + unique_chars = set(text) + num_uniq_chars = len(unique_chars) + + print(sorted(unique_chars)) + print("# utts : " + str(num_utts)) + print("# chars : " + str(num_chars)) + print("# uniq chars: " + str(num_uniq_chars)) + + # unaccented word stats + unaccented_words = 0 + for word in words: + if word == strip_accents_text(word): + unaccented_words += 1 + + print("# total words: " + str(num_words)) + print("# unaccented words : " + str(unaccented_words)) + print("-----------------------------------------------") + + # ambiguous word stats + ambiguity_map = defaultdict(set) + for word in words: + no_accents = strip_accents_text(word) + ambiguity_map[no_accents].add(word) + + ambiguous_words = 0 + ambiguous_words_2 = 0 + ambiguous_words_3 = 0 + ambiguous_words_4 = 0 + ambiguous_words_5 = 0 + ambiguous_words_6 = 0 + ambiguous_words_7 = 0 + ambiguous_words_8 = 0 + ambiguous_words_9 = 0 + + # fill ambiguity map + for word in ambiguity_map: + if len(ambiguity_map[word]) > 1: + ambiguous_words += 1 + if len(ambiguity_map[word]) == 2: + ambiguous_words_2 += 1 + elif len(ambiguity_map[word]) == 3: + ambiguous_words_3 += 1 + elif len(ambiguity_map[word]) == 4: + ambiguous_words_4 += 1 + elif len(ambiguity_map[word]) == 5: + ambiguous_words_5 += 1 + elif len(ambiguity_map[word]) == 6: + ambiguous_words_6 += 1 + elif len(ambiguity_map[word]) == 7: + ambiguous_words_7 += 1 + elif len(ambiguity_map[word]) == 8: + ambiguous_words_8 += 1 + elif len(ambiguity_map[word]) == 9: + 
ambiguous_words_9 += 1 + + # print ambiguity map + for word in ambiguity_map: + if len(ambiguity_map[word]) == 2: + print("# 2: " + str(ambiguity_map[word])) + if len(ambiguity_map[word]) == 3: + print("# 3: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 4: + print("# 4: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 5: + print("# 5: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 6: + print("# 6: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 7: + print("# 7: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 8: + print("# 8: " + str(ambiguity_map[word])) + elif len(ambiguity_map[word]) == 9: + print("# 9: " + str(ambiguity_map[word])) + + print("# unique ambiguous words : " + str(ambiguous_words)) + print("# total unique non-diacritized words : " + str(len(ambiguity_map))) + + unique_all_words = set() + for word in words: + unique_all_words.add(word) + + print("# total unique words : " + str(len(unique_all_words))) + print("-----------------------------------------------") + print("# ambiguous 2 words : " + str(ambiguous_words_2)) + print("# ambiguous 3 words : " + str(ambiguous_words_3)) + print("# ambiguous 4 words : " + str(ambiguous_words_4)) + print("# ambiguous 5 words : " + str(ambiguous_words_5)) + print("# ambiguous 6 words : " + str(ambiguous_words_6)) + print("# ambiguous 7 words : " + str(ambiguous_words_7)) + print("# ambiguous 8 words : " + str(ambiguous_words_8)) + print("# ambiguous 9 words : " + str(ambiguous_words_9)) diff --git a/tests/test_adr.py b/tests/test_adr.py index 7dc30de..8bd618d 100644 --- a/tests/test_adr.py +++ b/tests/test_adr.py @@ -1,18 +1,22 @@ # -*- coding: utf-8 -*- import filecmp -import iranlowo.adr as ránlọ import os +from iranlowo.adr import diacritize_text + +from iranlowo.preprocessing import strip_accents_text, strip_accents_file, normalize_diacritics_file +from iranlowo.utils import is_file_nfc, is_text_nfc, file_info + def test_strip_accents_text(): ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" yo_1 = "Kí ó tó di ààrẹ" - assert ránlọ.strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" - assert ránlọ.strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" - assert ránlọ.strip_accents_text(yo_1) == "Ki o to di aare" + assert strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" + assert strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" + assert strip_accents_text(yo_1) == "Ki o to di aare" def test_strip_accents_file(): @@ -21,7 +25,7 @@ def test_strip_accents_file(): reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt" processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt" - assert(ránlọ.strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed + assert(strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False) # src & processed are different assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) # processed matches reference @@ -30,17 +34,17 @@ def test_is_file_nfc(): cwd = os.getcwd() src_filepath_pass = cwd + "/tests/testdata/nfc.txt" src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt" - assert (ránlọ.is_file_nfc(src_filepath_pass) is True) - assert (ránlọ.is_file_nfc(src_filepath_fail) is 
False) + assert (is_file_nfc(src_filepath_pass) is True) + assert (is_file_nfc(src_filepath_fail) is False) def test_is_text_nfc(): - assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD - assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC + assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD + assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC # cover diacritics that have both accents and underdots - assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD - assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC + assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD + assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC def test_normalize_diacritics_file(): @@ -49,7 +53,7 @@ def test_normalize_diacritics_file(): reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt" - assert(ránlọ.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed + assert(normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False) # src & processed are different assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True) # processed matches reference @@ -57,7 +61,7 @@ def test_normalize_diacritics_file(): def test_file_info(): cwd = os.getcwd() reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - ránlọ.file_info(reference_nfc_filepath) + file_info(reference_nfc_filepath) # reference_nfc_filepath @@ -67,7 +71,7 @@ def test_file_info(): # reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" # processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" # -# assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath, +# assert(split_out_corpus_on_symbol(multiline_filepath, # reference_multiline_split_filepath, ',') is True) # job completed # assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different # assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference @@ -77,51 +81,51 @@ def test_file_info(): def test_diacritize_text(): - predictions = ránlọ.diacritize_text("leyin igba naa") + predictions = diacritize_text("leyin igba naa") assert(predictions == "lẹ́yìn ìgbà náà") # generated matches reference assert(predictions != "lẹ́yìn igbà náà") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("obinrin") + predictions = diacritize_text("obinrin") assert(predictions == "obìnrin") # generated matches reference assert(predictions != "obinrin") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("okunrin") + predictions = diacritize_text("okunrin") assert(predictions == "ọkùnrin") # generated matches reference assert(predictions != "ọkunrin") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("orisirisi") + predictions = diacritize_text("orisirisi") assert(predictions == "oríṣiríṣi") # generated matches reference assert(predictions != "orísiríṣi") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("nitori naa") + predictions = diacritize_text("nitori naa") assert(predictions 
== "nítorí náà") # generated matches reference assert(predictions != "nitorí náà") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo") + predictions = diacritize_text("leyin oro mi won ko tun soro mo") assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference - # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode") + # predictions = diacritize_text("awon okunrin nse ise agbara bi ise ode") # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("ati beebee lo") + predictions = diacritize_text("ati beebee lo") assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") + predictions = diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - # predictions = ránlọ.diacritize_text("bi ase nlo yii") + # predictions = diacritize_text("bi ase nlo yii") # assert(predictions == "bí aṣe ńlọ yìí") # generated matches reference # assert(predictions != "bí ase ńlọ yìí") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("o dabi pe") + predictions = diacritize_text("o dabi pe") assert(predictions == "ó dàbí pé") # generated matches reference assert(predictions != "ó dàbí pe") # generated does not match incorrect reference - predictions = ránlọ.diacritize_text("sugbon") + predictions = diacritize_text("sugbon") assert(predictions == "ṣùgbọ́n") # generated matches reference assert(predictions != "ṣugbọ́n") # generated does not match incorrect reference diff --git a/tests/test_corpus.py b/tests/test_corpus.py new file mode 100644 index 0000000..e69de29 From 741d9093d60c1cd19c36783d8d865e0ae1b51cc4 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sat, 13 Jul 2019 11:44:36 +0100 Subject: [PATCH 2/6] OOP Testing and Corpus Module --- requirements.txt | 1 + src/iranlowo/corpus/__init__.py | 1 + src/iranlowo/corpus/bbc_yoruba.py | 13 ++ src/iranlowo/corpus/bibeli.py | 12 ++ src/iranlowo/corpus/corpus.py | 55 +++------ src/iranlowo/corpus/scrapper.py | 0 src/iranlowo/corpus/yor_blog.py | 12 ++ src/iranlowo/interfaces.py | 14 +++ tests/pred.txt | 1 + tests/test_adr.py | 198 ++++++++++++++---------------- tests/test_corpus.py | 54 ++++++++ tests/test_preprocessing.py | 11 +- tests/utils.py | 7 ++ 13 files changed, 235 insertions(+), 144 deletions(-) create mode 100644 src/iranlowo/corpus/bbc_yoruba.py create mode 100644 src/iranlowo/corpus/bibeli.py create mode 100644 src/iranlowo/corpus/scrapper.py create mode 100644 src/iranlowo/corpus/yor_blog.py create mode 100644 src/iranlowo/interfaces.py create mode 100644 tests/pred.txt create mode 100644 tests/utils.py diff --git a/requirements.txt b/requirements.txt index 138f082..cdf04af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +gensim bs4 configargparse torch diff --git 
a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index e69de29..fc5a0d8 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -0,0 +1 @@ +from .corpus import Corpus, DirectoryCorpus diff --git a/src/iranlowo/corpus/bbc_yoruba.py b/src/iranlowo/corpus/bbc_yoruba.py new file mode 100644 index 0000000..004d6da --- /dev/null +++ b/src/iranlowo/corpus/bbc_yoruba.py @@ -0,0 +1,13 @@ +from iranlowo.corpus import Corpus + + +class BBCCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(BBCCorpus, self).__init__(path=self.path, **kwargs) + super().__init__(path) + diff --git a/src/iranlowo/corpus/bibeli.py b/src/iranlowo/corpus/bibeli.py new file mode 100644 index 0000000..378b029 --- /dev/null +++ b/src/iranlowo/corpus/bibeli.py @@ -0,0 +1,12 @@ +from iranlowo.corpus import Corpus + + +class BibeliCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(BibeliCorpus, self).__init__(path=self.path, **kwargs) + diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index cb47cc5..4e20a42 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -2,36 +2,33 @@ import os import sys -import requests from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus -from gensim.corpora.textcorpus import lower_to_unicode, strip_multiple_whitespaces, walk -from gensim.utils import deaccent +from gensim.corpora.textcorpus import walk -from iranlowo.preprocessing import is_valid_owé_format +from iranlowo.preprocessing import is_valid_owé_format, normalize_diacritics_text from iranlowo.utils import is_text_nfc class Corpus(interfaces.CorpusABC): - def __init__(self, path=None, text=None, is_url=False, rlist=False, stream=False, fformat='txt', cformat=None, labels=False, preprocess=False): + def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None): """ Args: path: text: - **kwargs: """ self.path = path self.text = text - self.rlist = rlist self.labels = labels self.stream = stream self.fformat = fformat - self.preprocess = preprocess self.cformat = cformat - self.is_url = is_url - self.data = text if text else self.read_file_or_filename() + self.preprocess = preprocess + if not self.preprocess: + self.preprocess = [normalize_diacritics_text] + self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() self.validate_format() def __iter__(self): @@ -41,9 +38,6 @@ def __iter__(self): def __len__(self): return len(self.data) - def get_data(self): - pass - @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): pass @@ -55,29 +49,24 @@ def streamfile(self, fobj): num_text += 1 yield line - def read_file_or_filename(self, f=None): + def read_file_filename_or_text(self, f=None, text=None): """ Returns: """ path = f if f else self.path - text = None - print(len(self.path)) out = [] - if isinstance(path, list): + if text: + return self.handle_preprocessing(text) if self.preprocess else text + elif isinstance(path, list): for f in path: path.remove(f) sys.setrecursionlimit(10000) - text = self.read_file_or_filename(f) + text = self.read_file_filename_or_text(f) out.append(text) else: - if self.is_url: - r = requests.get(path) - if r.status_code in [200, 201]: - text = r.text - return text - elif isinstance(path, str): + if isinstance(path, str): if self.fformat == "txt": text = open(path) elif self.fformat == "csv": @@ -87,22 +76,15 @@ def 
read_file_or_filename(self, f=None): else: text = self.path.seek(0) - if not self.stream: - text = text.read() if not self.rlist else text.readlines() - print(text) - if self.preprocess: - text = self.handle_preprocessing(text) - return text - else: - self.streamfile(text) + text = text.read() if not self.stream else ''.join(list(self.streamfile(text))) + return self.handle_preprocessing(text) if self.preprocess else text def handle_preprocessing(self, text): if callable(self.preprocess): return self.preprocess(text) if isinstance(self.preprocess, list): - prep_list = self.preprocess if isinstance(self.preprocess, list) else [lower_to_unicode, deaccent, strip_multiple_whitespaces] - for technique in prep_list: - text = technique(self.data) + for technique in self.preprocess: + text = technique(text) return text def validate_format(self): @@ -113,7 +95,7 @@ def validate_format(self): """ data = self.data if isinstance(data, list): - data = ' '.join(data) + data = ''.join(data) if not self.cformat and not is_text_nfc(data): raise TypeError("The corpus does not comply to the NFC corpus format") elif self.cformat == "owe": @@ -148,4 +130,3 @@ def __init__(self, path, **kwargs): def read_files(self): for path in self.flist: yield os.path.join(self.path_dir, path) - diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py new file mode 100644 index 0000000..e69de29 diff --git a/src/iranlowo/corpus/yor_blog.py b/src/iranlowo/corpus/yor_blog.py new file mode 100644 index 0000000..173b8f5 --- /dev/null +++ b/src/iranlowo/corpus/yor_blog.py @@ -0,0 +1,12 @@ +from iranlowo.corpus import Corpus + + +class YorubaBlogCorpus(Corpus): + def __init__(self, path): + """ + + Args: + path: + """ + super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) + diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py new file mode 100644 index 0000000..5ba518e --- /dev/null +++ b/src/iranlowo/interfaces.py @@ -0,0 +1,14 @@ +import scrapy + + +class Scrapper(scrapy.Spider): + """ + Interface for scrapping data from :mod:`iranlowo.scrapper` + """ + + def __init__(self, name, urls, **kwargs): + super(Scrapper, self).__init__(name, **kwargs) + + def parse(self, response): + pass + diff --git a/tests/pred.txt b/tests/pred.txt new file mode 100644 index 0000000..fa84c75 --- /dev/null +++ b/tests/pred.txt @@ -0,0 +1 @@ +ṣùgbọ́n diff --git a/tests/test_adr.py b/tests/test_adr.py index 8bd618d..7c9c646 100644 --- a/tests/test_adr.py +++ b/tests/test_adr.py @@ -1,131 +1,123 @@ # -*- coding: utf-8 -*- import filecmp -import os +import iranlowo.adr as ránlọ +from iranlowo import utils +from iranlowo import preprocessing +import unittest + +from tests.utils import datapath -from iranlowo.adr import diacritize_text -from iranlowo.preprocessing import strip_accents_text, strip_accents_file, normalize_diacritics_file -from iranlowo.utils import is_file_nfc, is_text_nfc, file_info +class IranlowoADRTest(unittest.TestCase): + + def test_strip_accents_text(self): + ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" + yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" + yo_1 = "Kí ó tó di ààrẹ" + + self.assertEqual(utils.strip_accents_text(ca_fr), "Montreal, uber, 12.89, Mere, Francoise, noel, 889") + self.assertEqual(utils.strip_accents_text(yo_0), "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika") + self.assertEqual(utils.strip_accents_text(yo_1), "Ki o to di aare") + def test_strip_accents_file(self): + src_filepath = datapath('src_file.txt') + reference_stripped_filepath = 
datapath('ref_proccessed_file.txt') + processed_stripped_filepath = datapath('processed_file.txt') -def test_strip_accents_text(): - ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889" - yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà" - yo_1 = "Kí ó tó di ààrẹ" + self.assertTrue(preprocessing.strip_accents_file(src_filepath, processed_stripped_filepath)) + self.assertFalse(filecmp.cmp(src_filepath, processed_stripped_filepath)) + self.assertTrue(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) - assert strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889" - assert strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika" - assert strip_accents_text(yo_1) == "Ki o to di aare" + def test_is_file_nfc(self): + src_filepath_pass = datapath('nfc.txt') + src_filepath_fail = datapath('nfc_fail.txt') + self.assertTrue(utils.is_file_nfc(src_filepath_pass)) + self.assertFalse(utils.is_file_nfc(src_filepath_fail)) -def test_strip_accents_file(): - cwd = os.getcwd() - src_filepath = cwd + "/tests/testdata/src_file.txt" - reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt" - processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt" + def test_is_text_nfc(self): + self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) + self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) - assert(strip_accents_file(src_filepath, processed_stripped_filepath) is True) # job completed - assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False) # src & processed are different - assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath)) # processed matches reference + self.assertTrue(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è')) + self.assertFalse(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è')) + def test_normalize_diacritics_file(self): + nfd_filepath = datapath('nfd.txt') + reference_nfc_filepath = datapath('nfc.txt') + processed_nfc_filepath = datapath('processed_nfc.txt') -def test_is_file_nfc(): - cwd = os.getcwd() - src_filepath_pass = cwd + "/tests/testdata/nfc.txt" - src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt" - assert (is_file_nfc(src_filepath_pass) is True) - assert (is_file_nfc(src_filepath_fail) is False) + self.assertTrue(preprocessing.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath)) + self.assertFalse(filecmp.cmp(nfd_filepath, processed_nfc_filepath)) # src & processed are different + self.assertTrue(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath)) # processed matches reference + def test_file_info(self): + reference_nfc_filepath = datapath('nfc.txt') + utils.file_info(reference_nfc_filepath) -def test_is_text_nfc(): - assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False) # NFD - assert(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True) # NFC - - # cover diacritics that have both accents and underdots - assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False) # NFD - assert(is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True) # NFC + # reference_nfc_filepath + # def test_split_corpus_on_symbol(self): + # cwd = os.getcwd() + # multiline_filepath = "/tests/testdata/multiline.txt" + # reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" + # processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" + # + # 
assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath, + # reference_multiline_split_filepath, ',') is True) # job completed + # assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different + # assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference + # + # # try different punctuation ',', ':', etc? -def test_normalize_diacritics_file(): - cwd = os.getcwd() - nfd_filepath = cwd + "/tests/testdata/nfd.txt" - reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt" + def test_diacritize_text(self): + predictions = ránlọ.diacritize_text("leyin igba naa") + self.assertEqual(predictions, "lẹ́yìn ìgbà náà") # generated matches reference + self.assertNotEqual(predictions, "lẹ́yìn igbà náà") # generated does not match incorrect reference - assert(normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True) # job completed - assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False) # src & processed are different - assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True) # processed matches reference + predictions = ránlọ.diacritize_text("obinrin") + self.assertEqual(predictions, "obìnrin") # generated matches reference + self.assertNotEqual(predictions, "obinrin") # generated does not match incorrect reference + predictions = ránlọ.diacritize_text("okunrin") + self.assertEqual(predictions, "ọkùnrin") # generated matches reference + self.assertNotEqual(predictions, "ọkunrin") # generated does not match incorrect reference -def test_file_info(): - cwd = os.getcwd() - reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt" - file_info(reference_nfc_filepath) + predictions = ránlọ.diacritize_text("orisirisi") + self.assertEqual(predictions, "oríṣiríṣi") # generated matches reference + self.assertNotEqual(predictions, "orísiríṣi") # generated does not match incorrect reference - # reference_nfc_filepath + predictions = ránlọ.diacritize_text("nitori naa") + self.assertEqual(predictions, "nítorí náà") # generated matches reference + self.assertNotEqual(predictions, "nitorí náà") # generated does not match incorrect reference -# def test_split_corpus_on_symbol(): -# cwd = os.getcwd() -# multiline_filepath = "/tests/testdata/multiline.txt" -# reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt" -# processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt" -# -# assert(split_out_corpus_on_symbol(multiline_filepath, -# reference_multiline_split_filepath, ',') is True) # job completed -# assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False) # src & processed are different -# assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True) # processed matches reference -# -# # try different punctuation ',', ':', etc? 
+ predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo") + self.assertEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference + self.assertNotEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference + # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode") + # assert(predictions , "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference + # assert(predictions , "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference -def test_diacritize_text(): + predictions = ránlọ.diacritize_text("ati beebee lo") + self.assertEqual(predictions, "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference + self.assertNotEqual(predictions, "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - predictions = diacritize_text("leyin igba naa") - assert(predictions == "lẹ́yìn ìgbà náà") # generated matches reference - assert(predictions != "lẹ́yìn igbà náà") # generated does not match incorrect reference + predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") + self.assertEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference + self.assertNotEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - predictions = diacritize_text("obinrin") - assert(predictions == "obìnrin") # generated matches reference - assert(predictions != "obinrin") # generated does not match incorrect reference + # predictions = ránlọ.diacritize_text("bi ase nlo yii") + # assert(predictions , "bí aṣe ńlọ yìí") # generated matches reference + # assert(predictions , "bí ase ńlọ yìí") # generated does not match incorrect reference + + predictions = ránlọ.diacritize_text("o dabi pe") + self.assertEqual(predictions, "ó dàbí pé") # generated matches reference + self.assertNotEqual(predictions, "ó dàbí pe") # generated does not match incorrect reference + + predictions = ránlọ.diacritize_text("sugbon") + self.assertEqual(predictions, "ṣùgbọ́n") # generated matches reference + self.assertNotEqual(predictions, "ṣugbọ́n") # generated does not match incorrect reference - predictions = diacritize_text("okunrin") - assert(predictions == "ọkùnrin") # generated matches reference - assert(predictions != "ọkunrin") # generated does not match incorrect reference - - predictions = diacritize_text("orisirisi") - assert(predictions == "oríṣiríṣi") # generated matches reference - assert(predictions != "orísiríṣi") # generated does not match incorrect reference - - predictions = diacritize_text("nitori naa") - assert(predictions == "nítorí náà") # generated matches reference - assert(predictions != "nitorí náà") # generated does not match incorrect reference - - predictions = diacritize_text("leyin oro mi won ko tun soro mo") - assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́") # generated matches reference - assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́") # generated does not match incorrect reference - - # predictions = diacritize_text("awon okunrin nse ise agbara bi ise ode") - # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated matches reference - # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ") # generated does not match incorrect reference - - predictions = diacritize_text("ati beebee lo") - assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated matches reference - 
assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ") # generated does not match incorrect reference - - predictions = diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari") - assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí") # generated matches reference - assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí") # generated does not match incorrect reference - - # predictions = diacritize_text("bi ase nlo yii") - # assert(predictions == "bí aṣe ńlọ yìí") # generated matches reference - # assert(predictions != "bí ase ńlọ yìí") # generated does not match incorrect reference - - predictions = diacritize_text("o dabi pe") - assert(predictions == "ó dàbí pé") # generated matches reference - assert(predictions != "ó dàbí pe") # generated does not match incorrect reference - - predictions = diacritize_text("sugbon") - assert(predictions == "ṣùgbọ́n") # generated matches reference - assert(predictions != "ṣugbọ́n") # generated does not match incorrect reference diff --git a/tests/test_corpus.py b/tests/test_corpus.py index e69de29..73fd4db 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -0,0 +1,54 @@ +import string +import unittest + +from iranlowo import corpus +from tests.utils import datapath + + +class TestTextCorpus(unittest.TestCase): + def setUp(self): + self.corpus_class = corpus.Corpus + self.txt_extension = 'txt' + self.csv_extension = 'csv' + self.gzip_extension = 'gzip' + + def test_load_corpus_from_path(self): + path = datapath('owe_pass') + corpus = self.corpus_class(path=path, fformat=self.txt_extension) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_from_path_stream(self): + path = datapath('owe_pass') + corpus = self.corpus_class(path=path, fformat=self.txt_extension, stream=True) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_from_text(self): + text = open(datapath('owe_pass')).read() + corpus = self.corpus_class(text=text) + self.assertEqual(len(corpus), 420) + + def test_load_corpus_with_preprocessing(self): + lines = [ + "Àwọn obìnrin, wọn ní kiní agbára yẹn lórí àwọn ọkùnrin?", + "Ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?", + "eranko wo lo buru julo" + ] + expected = [ + 'Àwọn obìnrin wọn ní kiní agbára yẹn lórí àwọn ọkùnrin', + "ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?", + 'erankowoloburujulo' + ] + + def punctuations(text): return text.translate(str.maketrans('', '', string.punctuation)) + + preprocessing = [ + lambda x: punctuations(x), lambda x: x.lower(), lambda x: x.replace(' ', '') + ] + + for index, entry in enumerate(lines): + corpus = self.corpus_class(text=entry, preprocess=preprocessing[index]) + self.assertEqual(corpus.data, expected[index]) + + def test_save(self): + pass + diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 3203e18..f172586 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,10 +1,13 @@ import os +import unittest from iranlowo import preprocessing -def test_is_valid_owe_format(): - cwd = os.getcwd() - fail_path = cwd + "/tests/testdata/nfc.txt" +class IranlowoCorpusTest(unittest.TestCase): - assert preprocessing.is_valid_owé_format(fail_path) is False + def test_is_valid_owe_format(self): + cwd = os.getcwd() + fail_path = cwd + "/tests/testdata/nfc.txt" + + assert preprocessing.is_valid_owé_format(fail_path) is False diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..a152c24 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,7 @@ +import os + +module_path = 
os.path.dirname(__file__) # needed because sample data files are located in the same folder + + +def datapath(fname): + return os.path.join(module_path, 'testdata', fname) From bff1c65765b67a6afd4ee983800734911fec8f42 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sat, 13 Jul 2019 11:47:12 +0100 Subject: [PATCH 3/6] Added preprocessing test --- tests/test_preprocessing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index f172586..eb10b1b 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -2,12 +2,11 @@ import unittest from iranlowo import preprocessing +from tests.utils import datapath class IranlowoCorpusTest(unittest.TestCase): def test_is_valid_owe_format(self): - cwd = os.getcwd() - fail_path = cwd + "/tests/testdata/nfc.txt" - - assert preprocessing.is_valid_owé_format(fail_path) is False + fail_path = datapath('nfc.txt') + self.assertFalse(preprocessing.is_valid_owé_format(fail_path)) From a07cb289f3961c73e31a51c410adc3fe2f7a578f Mon Sep 17 00:00:00 2001 From: Olamyy Date: Tue, 16 Jul 2019 00:05:37 +0100 Subject: [PATCH 4/6] Introducing Corpus Module --- src/iranlowo/corpus/__init__.py | 1 + src/iranlowo/corpus/bbc_yoruba.py | 13 ------ src/iranlowo/corpus/bibeli.py | 12 ------ src/iranlowo/corpus/corpus.py | 22 +++++------ src/iranlowo/corpus/loaders.py | 66 +++++++++++++++++++++++++++++++ src/iranlowo/corpus/yor_blog.py | 12 ------ src/iranlowo/utils.py | 6 +++ tests/test_corpus.py | 15 +++++++ tests/test_loaders.py | 13 ++++++ tests/test_preprocessing.py | 1 - tests/testdata/dirdata/yo_000.txt | 1 + tests/testdata/dirdata/yo_001.txt | 1 + tests/testdata/dirdata/yo_002.txt | 1 + tests/testdata/dirdata/yo_003.txt | 1 + tests/testdata/dirdata/yo_004.txt | 1 + tests/testdata/dirdata/yo_005.txt | 1 + tests/testdata/dirdata/yo_006.txt | 1 + tests/testdata/dirdata/yo_007.txt | 1 + tests/testdata/dirdata/yo_008.txt | 1 + tests/utils.py | 4 +- 20 files changed, 124 insertions(+), 50 deletions(-) delete mode 100644 src/iranlowo/corpus/bbc_yoruba.py delete mode 100644 src/iranlowo/corpus/bibeli.py create mode 100644 src/iranlowo/corpus/loaders.py delete mode 100644 src/iranlowo/corpus/yor_blog.py create mode 100644 tests/test_loaders.py create mode 100644 tests/testdata/dirdata/yo_000.txt create mode 100644 tests/testdata/dirdata/yo_001.txt create mode 100644 tests/testdata/dirdata/yo_002.txt create mode 100644 tests/testdata/dirdata/yo_003.txt create mode 100644 tests/testdata/dirdata/yo_004.txt create mode 100644 tests/testdata/dirdata/yo_005.txt create mode 100644 tests/testdata/dirdata/yo_006.txt create mode 100644 tests/testdata/dirdata/yo_007.txt create mode 100644 tests/testdata/dirdata/yo_008.txt diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index fc5a0d8..4201d4e 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -1 +1,2 @@ from .corpus import Corpus, DirectoryCorpus +from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus \ No newline at end of file diff --git a/src/iranlowo/corpus/bbc_yoruba.py b/src/iranlowo/corpus/bbc_yoruba.py deleted file mode 100644 index 004d6da..0000000 --- a/src/iranlowo/corpus/bbc_yoruba.py +++ /dev/null @@ -1,13 +0,0 @@ -from iranlowo.corpus import Corpus - - -class BBCCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BBCCorpus, self).__init__(path=self.path, **kwargs) - super().__init__(path) - diff --git 
a/src/iranlowo/corpus/bibeli.py b/src/iranlowo/corpus/bibeli.py deleted file mode 100644 index 378b029..0000000 --- a/src/iranlowo/corpus/bibeli.py +++ /dev/null @@ -1,12 +0,0 @@ -from iranlowo.corpus import Corpus - - -class BibeliCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BibeliCorpus, self).__init__(path=self.path, **kwargs) - diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 4e20a42..81fa2ff 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -1,7 +1,5 @@ import gzip import os -import sys - from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus @@ -26,6 +24,7 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No self.fformat = fformat self.cformat = cformat self.preprocess = preprocess + assert self.path or self.text, "You should pass either a path or text to read data from." if not self.preprocess: self.preprocess = [normalize_diacritics_text] self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() @@ -61,10 +60,9 @@ def read_file_filename_or_text(self, f=None, text=None): return self.handle_preprocessing(text) if self.preprocess else text elif isinstance(path, list): for f in path: - path.remove(f) - sys.setrecursionlimit(10000) text = self.read_file_filename_or_text(f) out.append(text) + return out else: if isinstance(path, str): if self.fformat == "txt": @@ -119,14 +117,16 @@ def generate(self, size): class DirectoryCorpus(Corpus): def __init__(self, path, **kwargs): - self.path_dir = path - walked = list(walk(self.path_dir)) - self.depth = walked[0][0] - self.dirnames = walked[0][2] - self.flist = walked[0][3] + self.dir_path = path + self.depth = kwargs.get('min_depth', 0) self.path = list(self.read_files()) super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) def read_files(self): - for path in self.flist: - yield os.path.join(self.path_dir, path) + walked = list(walk(self.dir_path)) + if not walked: + raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path)) + for depth, dirpath, _, filenames in walked: + if self.depth <= depth: + for path in filenames: + yield os.path.join(dirpath, path) diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py new file mode 100644 index 0000000..1314af6 --- /dev/null +++ b/src/iranlowo/corpus/loaders.py @@ -0,0 +1,66 @@ +import os + +from iranlowo.corpus import Corpus, DirectoryCorpus + + +class BaseLoader(object): + def __init__(self, corpus_path): + self.corpus_path = corpus_path + yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None) + if not yoruba_text_path: + raise NotADirectoryError( + "YORUBA_TEXT_PATH environment variable not found. 
Please clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set YORUBA_TEXT_PATH to its " "path") + else: + corpus_path = "{}/{}".format(yoruba_text_path, corpus_path) + self.path = corpus_path + + +class YorubaBlogCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(YorubaBlogCorpus, self).__init__(path=path, **kwargs) + + +class BBCCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(BBCCorpus, self).__init__(path=path, **kwargs) + + +class BibeliCorpus(Corpus): + def __init__(self, path, **kwargs): + """ + + Args: + path: location of the corpus data. + """ + super(BibeliCorpus, self).__init__(path=path, **kwargs) + + +class en(BaseLoader, DirectoryCorpus): + def __init__(self): + BaseLoader.__init__(self, corpus_path="Owe/en") + DirectoryCorpus.__init__(self, path=self.path) + + +class yo(BaseLoader, DirectoryCorpus): + def __init__(self): + BaseLoader.__init__(self, corpus_path="Owe/yo") + DirectoryCorpus.__init__(self, path=self.path) + + +class OweLoader(object): + def __init__(self): + self.en = en() + self.yo = yo() + diff --git a/src/iranlowo/corpus/yor_blog.py b/src/iranlowo/corpus/yor_blog.py deleted file mode 100644 index 173b8f5..0000000 --- a/src/iranlowo/corpus/yor_blog.py +++ /dev/null @@ -1,12 +0,0 @@ -from iranlowo.corpus import Corpus - - -class YorubaBlogCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) - diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py index a861d32..f4ab2c5 100644 --- a/src/iranlowo/utils.py +++ b/src/iranlowo/utils.py @@ -2,6 +2,8 @@ import unicodedata from collections import defaultdict +from pathlib import Path + from iranlowo.preprocessing import strip_accents_text @@ -27,6 +29,10 @@ def is_text_nfc(text): return False +def string_to_path(string): + return Path(string) + + def file_info(filename): """File metadata useful for various ADR tasks""" diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 73fd4db..4ebc36e 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -1,5 +1,6 @@ import string import unittest +from pathlib import Path from iranlowo import corpus from tests.utils import datapath @@ -8,6 +9,7 @@ class TestTextCorpus(unittest.TestCase): def setUp(self): self.corpus_class = corpus.Corpus + self.directory_loader = corpus.DirectoryCorpus self.txt_extension = 'txt' self.csv_extension = 'csv' self.gzip_extension = 'gzip' @@ -49,6 +51,19 @@ def punctuations(text): return text.translate(str.maketrans('', '', string.punct corpus = self.corpus_class(text=entry, preprocess=preprocessing[index]) self.assertEqual(corpus.data, expected[index]) + def test_load_corpus_from_directory(self): + direc = datapath('dirdata') + invalid_dir = datapath('test_data') + multi_dir = datapath() + path = Path(direc).glob('*') + dir_corpus = self.directory_loader(path=direc) + self.assertEqual(len(dir_corpus.data), len(list(path))) + with self.assertRaises(NotADirectoryError): + self.directory_loader(path=invalid_dir) + multi_corp = self.directory_loader(path=multi_dir) + multi_path = Path(multi_dir).glob('**/*') + self.assertEqual(len(multi_corp.data), len(list(multi_path))-1) + def test_save(self): pass diff --git a/tests/test_loaders.py b/tests/test_loaders.py new file mode 100644 index 0000000..beffa3e --- /dev/null +++ b/tests/test_loaders.py @@ -0,0 +1,13 @@ +import unittest +import os + +from iranlowo import corpus + + +class
+    def setUp(self):
+        self.owe_loader = corpus.OweLoader
+
+    def test_load_owe(self):
+        with self.assertRaises(NotADirectoryError):
+            self.owe_loader()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index eb10b1b..84dc379 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,4 +1,3 @@
-import os
 import unittest
 
 from iranlowo import preprocessing
diff --git a/tests/testdata/dirdata/yo_000.txt b/tests/testdata/dirdata/yo_000.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/testdata/dirdata/yo_000.txt
@@ -0,0 +1 @@
+
diff --git a/tests/testdata/dirdata/yo_001.txt b/tests/testdata/dirdata/yo_001.txt
new file mode 100644
index 0000000..ab97739
--- /dev/null
+++ b/tests/testdata/dirdata/yo_001.txt
@@ -0,0 +1 @@
+A di gàárì sílẹ̀ ewúrẹ́ ńyọjú; ẹrù ìran rẹ̀ ni?
diff --git a/tests/testdata/dirdata/yo_002.txt b/tests/testdata/dirdata/yo_002.txt
new file mode 100644
index 0000000..87d7002
--- /dev/null
+++ b/tests/testdata/dirdata/yo_002.txt
@@ -0,0 +1 @@
+A fi ọ́ jọba ò ńṣàwúre o fẹ́ jẹ Ọlọ́run ni?
diff --git a/tests/testdata/dirdata/yo_003.txt b/tests/testdata/dirdata/yo_003.txt
new file mode 100644
index 0000000..822cd91
--- /dev/null
+++ b/tests/testdata/dirdata/yo_003.txt
@@ -0,0 +1 @@
+A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, bí a ò bá jà, bí a bá ti gba Awà, kò tán bí?
diff --git a/tests/testdata/dirdata/yo_004.txt b/tests/testdata/dirdata/yo_004.txt
new file mode 100644
index 0000000..c08f5e6
--- /dev/null
+++ b/tests/testdata/dirdata/yo_004.txt
@@ -0,0 +1 @@
+A gbé gàárì ọmọ ewurẹ ńrojú; kì í ṣe ẹrù àgùntàn.
diff --git a/tests/testdata/dirdata/yo_005.txt b/tests/testdata/dirdata/yo_005.txt
new file mode 100644
index 0000000..19a221e
--- /dev/null
+++ b/tests/testdata/dirdata/yo_005.txt
@@ -0,0 +1 @@
+A kì í bá ọba pàlà kí ọkọ́ ọba má ṣàn-ánni lẹ́sẹ̀.
diff --git a/tests/testdata/dirdata/yo_006.txt b/tests/testdata/dirdata/yo_006.txt
new file mode 100644
index 0000000..35113f5
--- /dev/null
+++ b/tests/testdata/dirdata/yo_006.txt
@@ -0,0 +1 @@
+A kì í bínú ààtàn ká dalẹ̀ sígbẹ̀ẹ́.
diff --git a/tests/testdata/dirdata/yo_007.txt b/tests/testdata/dirdata/yo_007.txt
new file mode 100644
index 0000000..695bae3
--- /dev/null
+++ b/tests/testdata/dirdata/yo_007.txt
@@ -0,0 +1 @@
+A kì í bínú orí ká fi fìlà dé ìbàdí.
diff --git a/tests/testdata/dirdata/yo_008.txt b/tests/testdata/dirdata/yo_008.txt
new file mode 100644
index 0000000..5123008
--- /dev/null
+++ b/tests/testdata/dirdata/yo_008.txt
@@ -0,0 +1 @@
+A kì í bẹ̀rù ikú bẹ̀rù àrùn ká ní kí ọmọ ó kú sinni.
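
[Note: the dirdata fixtures above give DirectoryCorpus something concrete to walk. A minimal sketch of the behaviour that test_load_corpus_from_directory asserts -- illustrative only, assuming the package is importable and the working directory is the repository root:]

    # Not part of the patch: a sketch of what the new fixtures exercise.
    from pathlib import Path

    from iranlowo.corpus import DirectoryCorpus

    fixture_dir = "tests/testdata/dirdata"  # the yo_00*.txt files added above

    # DirectoryCorpus walks the directory and reads one entry per file,
    # so its data should line up with a plain glob of the same folder.
    corpus = DirectoryCorpus(path=fixture_dir)
    assert len(corpus.data) == len(list(Path(fixture_dir).glob("*")))
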
diff --git a/tests/utils.py b/tests/utils.py
index a152c24..8f8ed14 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -3,5 +3,7 @@
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
 
 
-def datapath(fname):
+def datapath(fname=None):
+    if not fname:
+        return os.path.join(module_path, 'testdata')
     return os.path.join(module_path, 'testdata', fname)

From 7d891c49d2da323bd52706747beccdef4a7f9ab3 Mon Sep 17 00:00:00 2001
From: Olamyy
Date: Tue, 16 Jul 2019 09:44:29 +0100
Subject: [PATCH 5/6] Fixing text

---
 requirements.txt      | 3 ++-
 tests/test_loaders.py | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cdf04af..5441c7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
-gensim
 bs4
 configargparse
 torch
 numpy
 requests
 tqdm
+google-compute-engine
+gensim
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
index beffa3e..7468a35 100644
--- a/tests/test_loaders.py
+++ b/tests/test_loaders.py
@@ -1,5 +1,4 @@
 import unittest
-import os
 
 from iranlowo import corpus
 

From f56b6f06e682a19e4c0e55a6a090d11e16f6f02c Mon Sep 17 00:00:00 2001
From: Olamyy
Date: Sun, 21 Jul 2019 16:33:29 +0100
Subject: [PATCH 6/6] Checking if __init__ solves the failing tests.

---
 tests/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
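
[Note: taken together, the series leaves the corpus package with a small loader layer on top of Corpus and DirectoryCorpus. A rough usage sketch -- assuming a local clone of https://github.com/Niger-Volta-LTI/yoruba-text; the clone path below is illustrative:]

    import os

    # BaseLoader resolves corpus paths against this environment variable and
    # raises NotADirectoryError when it is unset -- the exact case that
    # tests/test_loaders.py covers.
    os.environ["YORUBA_TEXT_PATH"] = "/path/to/yoruba-text"  # illustrative

    from iranlowo.corpus.loaders import OweLoader

    owe = OweLoader()       # wires up the Owe/en and Owe/yo directory corpora
    print(owe.yo.data[:2])  # first two Yoruba proverbs
    print(owe.en.data[:2])  # their English renderings
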