diff --git a/requirements.txt b/requirements.txt
index 138f082..5441c7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,6 @@ torch
 numpy
 requests
 tqdm
+google-compute-engine
+gensim
+scrapy
diff --git a/src/iranlowo/adr.py b/src/iranlowo/adr.py
index 1012213..e57e730 100644
--- a/src/iranlowo/adr.py
+++ b/src/iranlowo/adr.py
@@ -4,272 +4,11 @@ from __future__ import unicode_literals
 
 import pkg_resources
-import re
-import unicodedata
-
 from argparse import Namespace
-from collections import defaultdict
 
 from onmt.translate.translator import build_translator
 from onmt.utils.parse import ArgumentParser
 
 
-def strip_accents_text(text_string):
-    """
-    Converts the string to NFD, separates & returns only the base characters
-    :param text_string:
-    :return: input string without diacritic adornments on base characters
-    """
-    return "".join(
-        c
-        for c in unicodedata.normalize("NFD", text_string)
-        if unicodedata.category(c) != "Mn"
-    )
-
-
-def strip_accents_file(filename, outfilename):
-    """
-    Reads filename containing diacritics, converts to NFC for consistency,
-    then writes outfilename with diacritics removed
-    :param filename:
-    :param outfilename:
-    :return: None
-    """
-    text = "".join(
-        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
-    )
-    try:
-        f = open(outfilename, "w")
-    except EnvironmentError:
-        return False
-    else:
-        with f:
-            f.write(strip_accents_text(text))
-        return True
-
-
-def is_file_nfc(path):
-    """
-
-    Args:
-        path: File path
-
-    Returns: True if file is valid nfc and False if not. Raises a ValueError if path is not correct
-
-    """
-    text = open(path).read()
-    return is_text_nfc(text)
-
-
-def is_text_nfc(text):
-    """Validate unicode form of given text"""
-    nfc_text = "".join(c for c in unicodedata.normalize("NFC", text))
-    if nfc_text == text:
-        return True
-    else:
-        return False
-
-
-def normalize_diacritics_text(text_string):
-    """Convenience wrapper to abstract away unicode & NFC"""
-    return unicodedata.normalize("NFC", text_string)
-
-
-def normalize_diacritics_file(filename, outfilename):
-    """File based Convenience wrapper to abstract away unicode & NFC"""
-    try:
-        text = "".join(
-            c
-            for c in unicodedata.normalize(
-                "NFC", open(filename, encoding="utf-8").read()
-            )
-        )
-        with open(outfilename, "w", encoding="utf-8") as f:
-            f.write(text)
-    except EnvironmentError:
-        return False
-    else:
-        return True
-
-
-def file_info(filename):
-    """File metadata useful for various ADR tasks"""
-
-    print("\nFilename: " + filename)
-    print("---------------------------------")
-
-    lines = tuple(open(filename, "r", encoding="utf-8"))
-    num_utts = len(lines)
-
-    text = "".join(
-        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
-    )
-    words = re.findall("\w+", text)
-    num_words = len(words)
-    num_chars = len(re.findall(r"\S", text))
-
-    unique_chars = set(text)
-    num_uniq_chars = len(unique_chars)
-
-    print(sorted(unique_chars))
-    print("# utts : " + str(num_utts))
-    print("# chars : " + str(num_chars))
-    print("# uniq chars: " + str(num_uniq_chars))
-
-    # unaccented word stats
-    unaccented_words = 0
-    for word in words:
-        if word == strip_accents_text(word):
-            unaccented_words += 1
-
-    print("# total words: " + str(num_words))
-    print("# unaccented words : " + str(unaccented_words))
-    print("-----------------------------------------------")
-
-    # ambiguous word stats
-    ambiguity_map = defaultdict(set)
-    for word in words:
-        no_accents = strip_accents_text(word)
-        ambiguity_map[no_accents].add(word)
-
-    ambiguous_words = 0
-    ambiguous_words_2 = 0
-    ambiguous_words_3 = 0
-    ambiguous_words_4 = 0
-    ambiguous_words_5 = 0
-    ambiguous_words_6 = 0
-    ambiguous_words_7 = 0
-    ambiguous_words_8 = 0
-    ambiguous_words_9 = 0
-
-    # fill ambiguity map
-    for word in ambiguity_map:
-        if len(ambiguity_map[word]) > 1:
-            ambiguous_words += 1
-            if len(ambiguity_map[word]) == 2:
-                ambiguous_words_2 += 1
-            elif len(ambiguity_map[word]) == 3:
-                ambiguous_words_3 += 1
-            elif len(ambiguity_map[word]) == 4:
-                ambiguous_words_4 += 1
-            elif len(ambiguity_map[word]) == 5:
-                ambiguous_words_5 += 1
-            elif len(ambiguity_map[word]) == 6:
-                ambiguous_words_6 += 1
-            elif len(ambiguity_map[word]) == 7:
-                ambiguous_words_7 += 1
-            elif len(ambiguity_map[word]) == 8:
-                ambiguous_words_8 += 1
-            elif len(ambiguity_map[word]) == 9:
-                ambiguous_words_9 += 1
-
-    # print ambiguity map
-    for word in ambiguity_map:
-        if len(ambiguity_map[word]) == 2:
-            print("# 2: " + str(ambiguity_map[word]))
-        if len(ambiguity_map[word]) == 3:
-            print("# 3: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 4:
-            print("# 4: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 5:
-            print("# 5: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 6:
-            print("# 6: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 7:
-            print("# 7: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 8:
-            print("# 8: " + str(ambiguity_map[word]))
-        elif len(ambiguity_map[word]) == 9:
-            print("# 9: " + str(ambiguity_map[word]))
-
-    print("# unique ambiguous words : " + str(ambiguous_words))
-    print("# total unique non-diacritized words : " + str(len(ambiguity_map)))
-
-    unique_all_words = set()
-    for word in words:
-        unique_all_words.add(word)
-
-    print("# total unique words : " + str(len(unique_all_words)))
-    print("-----------------------------------------------")
-    print("# ambiguous 2 words : " + str(ambiguous_words_2))
-    print("# ambiguous 3 words : " + str(ambiguous_words_3))
-    print("# ambiguous 4 words : " + str(ambiguous_words_4))
-    print("# ambiguous 5 words : " + str(ambiguous_words_5))
-    print("# ambiguous 6 words : " + str(ambiguous_words_6))
-    print("# ambiguous 7 words : " + str(ambiguous_words_7))
-    print("# ambiguous 8 words : " + str(ambiguous_words_8))
-    print("# ambiguous 9 words : " + str(ambiguous_words_9))
-
-
-def split_corpus_on_symbol(filename, outfilename, symbol=","):
-    """
-    For yoruba blog (and probably bibeli mimo)
-
-    Args: filenames for I/O and symbol to split lines on
-    Returns: writes outputfile
-    :param filename: input file
-    :param outfilename: processed output file to write
-    :param symbol: to split lines on
-    :return: None, with side-effect of writing an outputfile
-    """
-
-    lines = tuple(open(filename, "r", encoding="utf-8"))
-
-    min_words_to_split = 10
-    min_words_in_utt = 5
-
-    with open(outfilename, "w") as f:
-        # split out heavily comma'd text :((
-        for line in lines:
-            if symbol in line:
-                num_words = len(line.split())
-                num_commas = line.count(symbol)
-                curr_comma_position = line.index(symbol)
-                num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())
-
-                curr_line = line
-                while num_commas > 0:
-                    if num_words < min_words_to_split:
-                        # print(curr_line.strip())
-                        f.write(curr_line)
-                        break
-                    if num_words >= min_words_to_split:
-                        if (
-                            num_words_ahead_of_curr_comma >= min_words_in_utt
-                            and len((curr_line)[curr_comma_position:].split())
-                            >= min_words_in_utt
-                        ):
-                            f.write((curr_line)[0:curr_comma_position] + "\n")
-
-                            # update vars
-                            curr_line = curr_line[curr_comma_position + 1:]
-                            num_words = len(curr_line.split())
-                            num_commas = num_commas - 1
-                            if num_commas > 0:
-                                curr_comma_position = curr_line.index(symbol)
-                                num_words_ahead_of_curr_comma = len(
-                                    curr_line[0:curr_comma_position].split()
-                                )
-                            else:
-                                f.write(curr_line)
-                        else:
-                            # ignore too short comma (+= vs = on current comma position)
-                            num_commas = num_commas - 1
-                            if num_commas > 0:  # for say 3 commas
-                                curr_comma_position += (
-                                    curr_line[curr_comma_position + 1:].index(symbol)
-                                    + 1
-                                )
-                                num_words_ahead_of_curr_comma = len(
-                                    curr_line[0:curr_comma_position].split()
-                                )
-                            else:
-                                f.write(curr_line)
-                    else:
-                        f.write(curr_line)
-            else:
-                f.write(line)
-
-
 def diacritize_text(undiacritized_text, verbose=False):
     # manually construct the options so we don't have to pass them in.
     opt = Namespace()
@@ -339,13 +78,3 @@ def diacritize_text(undiacritized_text, verbose=False):
     )
 
     return prediction[0][0]
-
-if __name__ == "__main__":
-    # # test
-    print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFD
-    print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFC
-    print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Book_of_Mormon/cleaned/doctrine_and_covenants.txt'))
-
-    print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Owe/yoruba_proverbs_out.txt'))
-
-    # file_info("../../tests/testdata/nfc.txt")
diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py
new file mode 100644
index 0000000..4201d4e
--- /dev/null
+++ b/src/iranlowo/corpus/__init__.py
@@ -0,0 +1,2 @@
+from .corpus import Corpus, DirectoryCorpus
+from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
\ No newline at end of file
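The package __init__ above re-exports the corpus classes at package level. A minimal usage sketch of the intended import surface (illustrative only; "owe.txt" is a hypothetical UTF-8 corpus file):

    from iranlowo.corpus import Corpus

    corp = Corpus(path="owe.txt")  # reads the file and NFC-normalizes it by default
    print(len(corp.data))          # size of the preprocessed corpus text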
diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
new file mode 100644
index 0000000..81fa2ff
--- /dev/null
+++ b/src/iranlowo/corpus/corpus.py
@@ -0,0 +1,132 @@
+import gzip
+import os
+
+from gensim import interfaces
+from gensim.corpora.csvcorpus import CsvCorpus
+from gensim.corpora.textcorpus import walk
+
+from iranlowo.preprocessing import is_valid_owé_format, normalize_diacritics_text
+from iranlowo.utils import is_text_nfc
+
+
+class Corpus(interfaces.CorpusABC):
+    def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
+        """
+        A text corpus read from a path, a list of paths, or a raw string.
+
+        Args:
+            path: path to a corpus file, a list of such paths, or an open file object.
+            text: raw corpus text, used instead of reading from path.
+            stream: if True, read the source lazily, line by line.
+            fformat: file format, one of 'txt', 'csv' or 'gzip'.
+            cformat: optional corpus format to validate against, e.g. 'owe'.
+            labels: for CSV corpora, whether the file carries labels.
+            preprocess: a callable or list of callables applied to the text;
+                defaults to NFC normalization.
+        """
+        self.path = path
+        self.text = text
+        self.labels = labels
+        self.stream = stream
+        self.fformat = fformat
+        self.cformat = cformat
+        self.preprocess = preprocess
+        assert self.path or self.text, "You should pass either a path or text to read data from."
+        if not self.preprocess:
+            self.preprocess = [normalize_diacritics_text]
+        self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
+        self.validate_format()
+
+    def __iter__(self):
+        for line in self.data:
+            yield line
+
+    def __len__(self):
+        return len(self.data)
+
+    @staticmethod
+    def save_corpus(fname, corpus, id2word=None, metadata=False):
+        pass
+
+    def streamfile(self, fobj):
+        with fobj as obj:
+            for line in obj:
+                yield line
+
+    def read_file_filename_or_text(self, f=None, text=None):
+        """
+        Read corpus data from a file path, a list of paths, an open file
+        object, or a raw text string, applying any configured preprocessing.
+
+        Returns: the corpus text, or a list of texts when given a list of paths.
+        """
+        path = f if f else self.path
+        out = []
+        if text:
+            return self.handle_preprocessing(text) if self.preprocess else text
+        elif isinstance(path, list):
+            for f in path:
+                text = self.read_file_filename_or_text(f)
+                out.append(text)
+            return out
+        else:
+            if isinstance(path, str):
+                if self.fformat == "txt":
+                    text = open(path)
+                elif self.fformat == "csv":
+                    text = CsvCorpus(path, self.labels)
+                elif self.fformat == 'gzip':
+                    text = gzip.open(path)
+            else:
+                self.path.seek(0)
+                text = self.path
+
+            text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
+            return self.handle_preprocessing(text) if self.preprocess else text
+
+    def handle_preprocessing(self, text):
+        if callable(self.preprocess):
+            return self.preprocess(text)
+        if isinstance(self.preprocess, list):
+            for technique in self.preprocess:
+                text = technique(text)
+            return text
+
+    def validate_format(self):
+        """
+        Check that the corpus data is NFC-normalized, or that it matches the
+        declared corpus format (cformat).
+
+        Returns: True if the corpus is valid; raises TypeError otherwise.
+        """
+        data = self.data
+        if isinstance(data, list):
+            data = ''.join(data)
+        if not self.cformat and not is_text_nfc(data):
+            raise TypeError("The corpus does not comply with the NFC corpus format")
+        elif self.cformat == "owe":
+            if not is_valid_owé_format(data):
+                raise TypeError("The corpus does not comply with the {0} corpus format".format(self.cformat))
+        else:
+            return True
+
+    def generate(self, size):
+        """
+        Generate random text in the corpus format.
+
+        Args:
+            size: number of utterances to generate.
+
+        Returns: the generated text.
+        """
+        if not self.cformat:
+            raise ValueError("You need to specify a format for generating random text")
+
+
+class DirectoryCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        self.dir_path = path
+        self.depth = kwargs.pop('min_depth', 0)
+        self.path = list(self.read_files())
+        super(DirectoryCorpus, self).__init__(path=self.path, **kwargs)
+
+    def read_files(self):
+        walked = list(walk(self.dir_path))
+        if not walked:
+            raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path))
+        for depth, dirpath, _, filenames in walked:
+            if self.depth <= depth:
+                for path in filenames:
+                    yield os.path.join(dirpath, path)
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
new file mode 100644
index 0000000..1314af6
--- /dev/null
+++ b/src/iranlowo/corpus/loaders.py
@@ -0,0 +1,66 @@
+import os
+
+from iranlowo.corpus import Corpus, DirectoryCorpus
+
+
+class BaseLoader(object):
+    def __init__(self, corpus_path):
+        self.corpus_path = corpus_path
+        yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None)
+        if not yoruba_text_path:
+            raise NotADirectoryError(
+                "YORUBA_TEXT_PATH environment variable not found. Please clone the corpus repository "
+                "from https://github.com/Niger-Volta-LTI/yoruba-text and set YORUBA_TEXT_PATH to its path")
+        else:
+            corpus_path = "{}/{}".format(yoruba_text_path, corpus_path)
+            self.path = corpus_path
+
+
+class YorubaBlogCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from the Yorùbá blog dataset.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(YorubaBlogCorpus, self).__init__(path=path, **kwargs)
+
+
+class BBCCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from BBC Yorùbá.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(BBCCorpus, self).__init__(path=path, **kwargs)
+
+
+class BibeliCorpus(Corpus):
+    def __init__(self, path, **kwargs):
+        """
+        Corpus of text from Bibeli Mimọ.
+
+        Args:
+            path: path to the corpus file.
+        """
+        super(BibeliCorpus, self).__init__(path=path, **kwargs)
+
+
+class en(BaseLoader, DirectoryCorpus):
+    def __init__(self):
+        BaseLoader.__init__(self, corpus_path="Owe/en")
+        DirectoryCorpus.__init__(self, path=self.path)
+
+
+class yo(BaseLoader, DirectoryCorpus):
+    def __init__(self):
+        BaseLoader.__init__(self, corpus_path="Owe/yo")
+        DirectoryCorpus.__init__(self, path=self.path)
+
+
+class OweLoader(object):
+    def __init__(self):
+        self.en = en()
+        self.yo = yo()
+
diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py
new file mode 100644
index 0000000..5ba518e
--- /dev/null
+++ b/src/iranlowo/interfaces.py
@@ -0,0 +1,14 @@
+import scrapy
+
+
+class Scrapper(scrapy.Spider):
+    """
+    Interface for scraping data with :mod:`iranlowo.scrapper`
+    """
+
+    def __init__(self, name, urls, **kwargs):
+        super(Scrapper, self).__init__(name, **kwargs)
+        self.start_urls = urls
+
+    def parse(self, response):
+        pass
+
diff --git a/src/iranlowo/preprocessing.py b/src/iranlowo/preprocessing.py
index fe72ee7..73545fe 100644
--- a/src/iranlowo/preprocessing.py
+++ b/src/iranlowo/preprocessing.py
@@ -1,5 +1,6 @@
 import csv
 import gzip
+import unicodedata
 
 from pathlib import Path
 
@@ -90,3 +91,130 @@ def get_chunk(txt, n):
         except IndexError:
             pass  # End of file reached
 
+
+def strip_accents_text(text_string):
+    """
+    Converts the string to NFD, separates & returns only the base characters
+    :param text_string:
+    :return: input string without diacritic adornments on base characters
+    """
+    return "".join(
+        c
+        for c in unicodedata.normalize("NFD", text_string)
+        if unicodedata.category(c) != "Mn"
+    )
+
+
+def strip_accents_file(filename, outfilename):
+    """
+    Reads filename containing diacritics, converts to NFC for consistency,
+    then writes outfilename with diacritics removed
+    :param filename:
+    :param outfilename:
+    :return: True on success, False if the output file could not be opened
+    """
+    text = "".join(
+        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
+    )
+    try:
+        f = open(outfilename, "w", encoding="utf-8")
+    except EnvironmentError:
+        return False
+    else:
+        with f:
+            f.write(strip_accents_text(text))
+        return True
+
+
+def normalize_diacritics_text(text_string):
+    """Convenience wrapper to abstract away unicode & NFC"""
+    return unicodedata.normalize("NFC", text_string)
+
+
+def normalize_diacritics_file(filename, outfilename):
+    """File based convenience wrapper to abstract away unicode & NFC"""
+    try:
+        text = "".join(
+            c
+            for c in unicodedata.normalize(
+                "NFC", open(filename, encoding="utf-8").read()
+            )
+        )
+        with open(outfilename, "w", encoding="utf-8") as f:
+            f.write(text)
+    except EnvironmentError:
+        return False
+    else:
+        return True
+
+
+def split_corpus_on_symbol(filename, outfilename, symbol=","):
+    """
+    Splits long symbol-delimited lines in the input file into shorter
+    utterances, for the Yorùbá blog corpus (and probably Bibeli Mimọ).
+
+    :param filename: input file
+    :param outfilename: processed output file to write
+    :param symbol: symbol to split lines on
+    :return: None, with the side effect of writing an output file
+    """
+
+    lines = tuple(open(filename, "r", encoding="utf-8"))
+
+    min_words_to_split = 10
+    min_words_in_utt = 5
+
+    with open(outfilename, "w", encoding="utf-8") as f:
+        # split out heavily comma'd text :((
+        for line in lines:
+            if symbol in line:
+                num_words = len(line.split())
+                num_commas = line.count(symbol)
+                curr_comma_position = line.index(symbol)
+                num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())
+
+                curr_line = line
+                while num_commas > 0:
+                    if num_words < min_words_to_split:
+                        f.write(curr_line)
+                        break
+                    if num_words >= min_words_to_split:
+                        if (
+                            num_words_ahead_of_curr_comma >= min_words_in_utt
+                            and len(curr_line[curr_comma_position:].split())
+                            >= min_words_in_utt
+                        ):
+                            f.write(curr_line[0:curr_comma_position] + "\n")
+
+                            # update vars
+                            curr_line = curr_line[curr_comma_position + 1:]
+                            num_words = len(curr_line.split())
+                            num_commas = num_commas - 1
+                            if num_commas > 0:
+                                curr_comma_position = curr_line.index(symbol)
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                f.write(curr_line)
+                        else:
+                            # ignore too short comma (+= vs = on current comma position)
+                            num_commas = num_commas - 1
+                            if num_commas > 0:  # for say 3 commas
+                                curr_comma_position += (
+                                    curr_line[curr_comma_position + 1:].index(symbol)
+                                    + 1
+                                )
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                f.write(curr_line)
+                    else:
+                        f.write(curr_line)
+            else:
+                f.write(line)
+
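The helpers moved into preprocessing.py are pure string functions, so they can be exercised directly; the expected value below follows from the NFD-stripping behaviour asserted in the tests:

    from iranlowo.preprocessing import normalize_diacritics_text, strip_accents_text

    s = normalize_diacritics_text("ọjọ́ìbí")  # compose to NFC
    print(strip_accents_text(s))              # "ojoibi" -- combining marks removed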
diff --git a/src/iranlowo/utils.py b/src/iranlowo/utils.py
new file mode 100644
index 0000000..f4ab2c5
--- /dev/null
+++ b/src/iranlowo/utils.py
@@ -0,0 +1,142 @@
+import re
+import unicodedata
+from collections import defaultdict
+
+from pathlib import Path
+
+from iranlowo.preprocessing import strip_accents_text
+
+
+def is_file_nfc(path):
+    """
+    Args:
+        path: File path
+
+    Returns: True if the file content is NFC-normalized, False if not.
+    Raises FileNotFoundError if the path does not exist.
+    """
+    text = open(path, encoding="utf-8").read()
+    return is_text_nfc(text)
+
+
+def is_text_nfc(text):
+    """Validate unicode form of given text"""
+    return unicodedata.normalize("NFC", text) == text
+
+
+def string_to_path(string):
+    return Path(string)
+
+
+def file_info(filename):
+    """File metadata useful for various ADR tasks"""
+
+    print("\nFilename: " + filename)
+    print("---------------------------------")
+
+    lines = tuple(open(filename, "r", encoding="utf-8"))
+    num_utts = len(lines)
+
+    text = "".join(
+        c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
+    )
+    words = re.findall(r"\w+", text)
+    num_words = len(words)
+    num_chars = len(re.findall(r"\S", text))
+
+    unique_chars = set(text)
+    num_uniq_chars = len(unique_chars)
+
+    print(sorted(unique_chars))
+    print("# utts : " + str(num_utts))
+    print("# chars : " + str(num_chars))
+    print("# uniq chars: " + str(num_uniq_chars))
+
+    # unaccented word stats
+    unaccented_words = 0
+    for word in words:
+        if word == strip_accents_text(word):
+            unaccented_words += 1
+
+    print("# total words: " + str(num_words))
+    print("# unaccented words : " + str(unaccented_words))
+    print("-----------------------------------------------")
+
+    # ambiguous word stats: map each stripped form to the set of
+    # diacritized variants observed for it
+    ambiguity_map = defaultdict(set)
+    for word in words:
+        no_accents = strip_accents_text(word)
+        ambiguity_map[no_accents].add(word)
+
+    # histogram of ambiguity levels: ambiguity_histogram[n] counts the
+    # stripped forms with exactly n diacritized variants
+    ambiguity_histogram = defaultdict(int)
+    ambiguous_words = 0
+    for word, variants in ambiguity_map.items():
+        if len(variants) > 1:
+            ambiguous_words += 1
+            ambiguity_histogram[len(variants)] += 1
+
+    # print ambiguity map
+    for word, variants in ambiguity_map.items():
+        if len(variants) > 1:
+            print("# " + str(len(variants)) + ": " + str(variants))
+
+    print("# unique ambiguous words : " + str(ambiguous_words))
+    print("# total unique non-diacritized words : " + str(len(ambiguity_map)))
+
+    unique_all_words = set(words)
+
+    print("# total unique words : " + str(len(unique_all_words)))
+    print("-----------------------------------------------")
+    for level in range(2, 10):
+        print("# ambiguous " + str(level) + " words : " + str(ambiguity_histogram[level]))
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pred.txt b/tests/pred.txt
new file mode 100644
index 0000000..fa84c75
--- /dev/null
+++ b/tests/pred.txt
@@ -0,0 +1 @@
+ṣùgbọ́n
diff --git a/tests/test_adr.py b/tests/test_adr.py
index 7dc30de..7c9c646 100644
--- a/tests/test_adr.py
+++ b/tests/test_adr.py
@@ -2,126 +2,122 @@
 import filecmp
 
 import iranlowo.adr as ránlọ
-import os
+from iranlowo import utils
+from iranlowo import preprocessing
+import unittest
+
+from tests.utils import datapath
 
 
-def test_strip_accents_text():
-    ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889"
-    yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà"
-    yo_1 = "Kí ó tó di ààrẹ"
+class IranlowoADRTest(unittest.TestCase):
+
+    def test_strip_accents_text(self):
+        ca_fr = "Montréal, über, 12.89, Mère, Françoise, noël, 889"
+        yo_0 = "ọjọ́ìbí 18 Oṣù Keje 1918 jẹ́ Ààrẹ Gúúsù Áfríkà"
+        yo_1 = "Kí ó tó di ààrẹ"
 
-    assert ránlọ.strip_accents_text(ca_fr) == "Montreal, uber, 12.89, Mere, Francoise, noel, 889"
-    assert ránlọ.strip_accents_text(yo_0) == "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika"
-    assert ránlọ.strip_accents_text(yo_1) == "Ki o to di aare"
+        self.assertEqual(utils.strip_accents_text(ca_fr), "Montreal, uber, 12.89, Mere, Francoise, noel, 889")
+        self.assertEqual(utils.strip_accents_text(yo_0), "ojoibi 18 Osu Keje 1918 je Aare Guusu Afrika")
+        self.assertEqual(utils.strip_accents_text(yo_1), "Ki o to di aare")
 
+    def test_strip_accents_file(self):
+        src_filepath = datapath('src_file.txt')
+        reference_stripped_filepath = datapath('ref_proccessed_file.txt')
+        processed_stripped_filepath = datapath('processed_file.txt')
 
-def test_strip_accents_file():
-    cwd = os.getcwd()
-    src_filepath = cwd + "/tests/testdata/src_file.txt"
-    reference_stripped_filepath = cwd + "/tests/testdata/ref_proccessed_file.txt"
-    processed_stripped_filepath = cwd + "/tests/testdata/processed_file.txt"
+        self.assertTrue(preprocessing.strip_accents_file(src_filepath, processed_stripped_filepath))
+        self.assertFalse(filecmp.cmp(src_filepath, processed_stripped_filepath))
+        self.assertTrue(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath))
 
-    assert(ránlọ.strip_accents_file(src_filepath, processed_stripped_filepath) is True)  # job completed
-    assert(filecmp.cmp(src_filepath, processed_stripped_filepath) is False)  # src & processed are different
-    assert(filecmp.cmp(reference_stripped_filepath, processed_stripped_filepath))  # processed matches reference
+    def test_is_file_nfc(self):
+        src_filepath_pass = datapath('nfc.txt')
+        src_filepath_fail = datapath('nfc_fail.txt')
+        self.assertTrue(utils.is_file_nfc(src_filepath_pass))
+        self.assertFalse(utils.is_file_nfc(src_filepath_fail))
 
-def test_is_file_nfc():
-    cwd = os.getcwd()
-    src_filepath_pass = cwd + "/tests/testdata/nfc.txt"
-    src_filepath_fail = cwd + "/tests/testdata/nfc_fail.txt"
-    assert (ránlọ.is_file_nfc(src_filepath_pass) is True)
-    assert (ránlọ.is_file_nfc(src_filepath_fail) is False)
+    def test_is_text_nfc(self):
+        self.assertFalse(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFD
+        self.assertTrue(utils.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?"))  # NFC
+        # cover diacritics that have both accents and underdots
+        self.assertTrue(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è'))  # NFC
+        self.assertFalse(utils.is_text_nfc('kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è'))  # NFD
 
-def test_is_text_nfc():
-    assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is False)  # NFD
-    assert(ránlọ.is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?") is True)  # NFC
-
-    # cover diacritics that have both accents and underdots
-    assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is False)  # NFD
-    assert(ránlọ.is_text_nfc("kòsí ǹǹkan tó le ń’bẹ̀ pé káa ṣẹ̀sìn-ìn ’dílé è") is True)  # NFC
+    def test_normalize_diacritics_file(self):
+        nfd_filepath = datapath('nfd.txt')
+        reference_nfc_filepath = datapath('nfc.txt')
+        processed_nfc_filepath = datapath('processed_nfc.txt')
+        self.assertTrue(preprocessing.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath))
+        self.assertFalse(filecmp.cmp(nfd_filepath, processed_nfc_filepath))  # src & processed are different
+        self.assertTrue(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath))  # processed matches reference
 
-def test_normalize_diacritics_file():
-    cwd = os.getcwd()
-    nfd_filepath = cwd + "/tests/testdata/nfd.txt"
-    reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt"
-    processed_nfc_filepath = cwd + "/tests/testdata/processed_nfc.txt"
+    def test_file_info(self):
+        reference_nfc_filepath = datapath('nfc.txt')
+        utils.file_info(reference_nfc_filepath)
 
-    assert(ránlọ.normalize_diacritics_file(nfd_filepath, processed_nfc_filepath) is True)  # job completed
-    assert(filecmp.cmp(nfd_filepath, processed_nfc_filepath) is False)  # src & processed are different
-    assert(filecmp.cmp(reference_nfc_filepath, processed_nfc_filepath) is True)  # processed matches reference
+    # def test_split_corpus_on_symbol(self):
+    #     multiline_filepath = datapath('multiline.txt')
+    #     reference_multiline_split_filepath = datapath('multiline.split.txt')
+    #     processed_multiline_split_filepath = datapath('processed_multiline.split.txt')
+    #
+    #     assert(preprocessing.split_corpus_on_symbol(multiline_filepath,
+    #            reference_multiline_split_filepath, ',') is True)  # job completed
+    #     assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False)  # src & processed are different
+    #     assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True)  # processed matches reference
+    #
+    #     # try different punctuation ',', ':', etc?
-def test_file_info():
-    cwd = os.getcwd()
-    reference_nfc_filepath = cwd + "/tests/testdata/nfc.txt"
-    ránlọ.file_info(reference_nfc_filepath)
-
-    # reference_nfc_filepath
-
-# def test_split_corpus_on_symbol():
-#     cwd = os.getcwd()
-#     multiline_filepath = "/tests/testdata/multiline.txt"
-#     reference_multiline_split_filepath = "/tests/testdata/multiline.split.txt"
-#     processed_multiline_split_filepath = "/tests/testdata/processed_multiline.split.txt"
-#
-#     assert(ránlọ.split_out_corpus_on_symbol(multiline_filepath,
-#            reference_multiline_split_filepath, ',') is True)  # job completed
-#     assert(filecmp.cmp(multiline_filepath, reference_multiline_split_filepath) is False)  # src & processed are different
-#     assert(filecmp.cmp(reference_multiline_split_filepath, processed_multiline_split_filepath) is True)  # processed matches reference
-#
-#     # try different punctuation ',', ':', etc?
-
-def test_diacritize_text():
-    predictions = ránlọ.diacritize_text("leyin igba naa")
-    assert(predictions == "lẹ́yìn ìgbà náà")  # generated matches reference
-    assert(predictions != "lẹ́yìn igbà náà")  # generated does not match incorrect reference
+    def test_diacritize_text(self):
+        predictions = ránlọ.diacritize_text("leyin igba naa")
+        self.assertEqual(predictions, "lẹ́yìn ìgbà náà")  # generated matches reference
+        self.assertNotEqual(predictions, "lẹ́yìn igbà náà")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("obinrin")
-    assert(predictions == "obìnrin")  # generated matches reference
-    assert(predictions != "obinrin")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("obinrin")
+        self.assertEqual(predictions, "obìnrin")  # generated matches reference
+        self.assertNotEqual(predictions, "obinrin")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("okunrin")
-    assert(predictions == "ọkùnrin")  # generated matches reference
-    assert(predictions != "ọkunrin")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("okunrin")
+        self.assertEqual(predictions, "ọkùnrin")  # generated matches reference
+        self.assertNotEqual(predictions, "ọkunrin")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("orisirisi")
-    assert(predictions == "oríṣiríṣi")  # generated matches reference
-    assert(predictions != "orísiríṣi")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("orisirisi")
+        self.assertEqual(predictions, "oríṣiríṣi")  # generated matches reference
+        self.assertNotEqual(predictions, "orísiríṣi")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("nitori naa")
-    assert(predictions == "nítorí náà")  # generated matches reference
-    assert(predictions != "nitorí náà")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("nitori naa")
+        self.assertEqual(predictions, "nítorí náà")  # generated matches reference
+        self.assertNotEqual(predictions, "nitorí náà")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo")
-    assert(predictions == "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́")  # generated matches reference
-    assert(predictions != "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("leyin oro mi won ko tun soro mo")
+        self.assertEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi wọn kò tún sọ̀rọ̀ mọ́")  # generated matches reference
+        self.assertNotEqual(predictions, "lẹ́yìn ọ̀rọ̀ mi won kò tún sọ̀rọ̀ mọ́")  # generated does not match incorrect reference
 
-    # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode")
-    # assert(predictions == "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated matches reference
-    # assert(predictions != "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated does not match incorrect reference
+        # predictions = ránlọ.diacritize_text("awon okunrin nse ise agbara bi ise ode")
+        # self.assertEqual(predictions, "àwọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated matches reference
+        # self.assertNotEqual(predictions, "awọn ọkùnrin nṣe iṣẹ́ agbára bí iṣẹ́ ọdẹ")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("ati beebee lo")
-    assert(predictions == "àti bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated matches reference
-    assert(predictions != "ati bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("ati beebee lo")
+        self.assertEqual(predictions, "àti bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated matches reference
+        self.assertNotEqual(predictions, "ati bẹ́ẹ̀bẹ́ẹ̀ lọ")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari")
-    assert(predictions == "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí")  # generated matches reference
-    assert(predictions != "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("bee ni gbogbo ise ago naa ti ago ajo pari")
+        self.assertEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti àgọ́ àjọ parí")  # generated matches reference
+        self.assertNotEqual(predictions, "bẹ́ẹ̀ ni gbogbo iṣẹ́ àgọ́ náà ti agọ́ àjọ parí")  # generated does not match incorrect reference
 
-    # predictions = ránlọ.diacritize_text("bi ase nlo yii")
-    # assert(predictions == "bí aṣe ńlọ yìí")  # generated matches reference
-    # assert(predictions != "bí ase ńlọ yìí")  # generated does not match incorrect reference
+        # predictions = ránlọ.diacritize_text("bi ase nlo yii")
+        # self.assertEqual(predictions, "bí aṣe ńlọ yìí")  # generated matches reference
+        # self.assertNotEqual(predictions, "bí ase ńlọ yìí")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("o dabi pe")
-    assert(predictions == "ó dàbí pé")  # generated matches reference
-    assert(predictions != "ó dàbí pe")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("o dabi pe")
+        self.assertEqual(predictions, "ó dàbí pé")  # generated matches reference
+        self.assertNotEqual(predictions, "ó dàbí pe")  # generated does not match incorrect reference
 
-    predictions = ránlọ.diacritize_text("sugbon")
-    assert(predictions == "ṣùgbọ́n")  # generated matches reference
-    assert(predictions != "ṣugbọ́n")  # generated does not match incorrect reference
+        predictions = ránlọ.diacritize_text("sugbon")
+        self.assertEqual(predictions, "ṣùgbọ́n")  # generated matches reference
+        self.assertNotEqual(predictions, "ṣugbọ́n")  # generated does not match incorrect reference
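The diacritizer itself is untouched by this refactor; restoring diacritics remains a single call into iranlowo.adr, with the expected outputs taken from the tests above:

    import iranlowo.adr as ránlọ

    print(ránlọ.diacritize_text("sugbon"))          # "ṣùgbọ́n"
    print(ránlọ.diacritize_text("leyin igba naa"))  # "lẹ́yìn ìgbà náà"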
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
new file mode 100644
index 0000000..4ebc36e
--- /dev/null
+++ b/tests/test_corpus.py
@@ -0,0 +1,69 @@
+import string
+import unittest
+from pathlib import Path
+
+from iranlowo import corpus
+from tests.utils import datapath
+
+
+class TestTextCorpus(unittest.TestCase):
+    def setUp(self):
+        self.corpus_class = corpus.Corpus
+        self.directory_loader = corpus.DirectoryCorpus
+        self.txt_extension = 'txt'
+        self.csv_extension = 'csv'
+        self.gzip_extension = 'gzip'
+
+    def test_load_corpus_from_path(self):
+        path = datapath('owe_pass')
+        corp = self.corpus_class(path=path, fformat=self.txt_extension)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_from_path_stream(self):
+        path = datapath('owe_pass')
+        corp = self.corpus_class(path=path, fformat=self.txt_extension, stream=True)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_from_text(self):
+        text = open(datapath('owe_pass')).read()
+        corp = self.corpus_class(text=text)
+        self.assertEqual(len(corp), 420)
+
+    def test_load_corpus_with_preprocessing(self):
+        lines = [
+            "Àwọn obìnrin, wọn ní kiní agbára yẹn lórí àwọn ọkùnrin?",
+            "Ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?",
+            "eranko wo lo buru julo"
+        ]
+        expected = [
+            'Àwọn obìnrin wọn ní kiní agbára yẹn lórí àwọn ọkùnrin',
+            "ati gbọ́ọ rí daadaa mà, báwo ni ẹ ṣe maa ri, mà?",
+            'erankowoloburujulo'
+        ]
+
+        def remove_punctuation(text):
+            return text.translate(str.maketrans('', '', string.punctuation))
+
+        preprocessing = [
+            lambda x: remove_punctuation(x), lambda x: x.lower(), lambda x: x.replace(' ', '')
+        ]
+
+        for index, entry in enumerate(lines):
+            corp = self.corpus_class(text=entry, preprocess=preprocessing[index])
+            self.assertEqual(corp.data, expected[index])
+
+    def test_load_corpus_from_directory(self):
+        direc = datapath('dirdata')
+        invalid_dir = datapath('test_data')
+        multi_dir = datapath()
+        path = Path(direc).glob('*')
+        dir_corpus = self.directory_loader(path=direc)
+        self.assertEqual(len(dir_corpus.data), len(list(path)))
+        with self.assertRaises(NotADirectoryError):
+            self.directory_loader(path=invalid_dir)
+        multi_corp = self.directory_loader(path=multi_dir)
+        multi_path = Path(multi_dir).glob('**/*')
+        self.assertEqual(len(multi_corp.data), len(list(multi_path)) - 1)
+
+    def test_save(self):
+        pass
+
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
new file mode 100644
index 0000000..7468a35
--- /dev/null
+++ b/tests/test_loaders.py
@@ -0,0 +1,12 @@
+import unittest
+
+from iranlowo import corpus
+
+
+class TestCorpusLoader(unittest.TestCase):
+    def setUp(self):
+        self.owe_loader = corpus.OweLoader
+
+    def test_load_owe(self):
+        with self.assertRaises(NotADirectoryError):
+            self.owe_loader()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 3203e18..84dc379 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,10 +1,11 @@
-import os
+import unittest
 
 from iranlowo import preprocessing
+from tests.utils import datapath
 
 
-def test_is_valid_owe_format():
-    cwd = os.getcwd()
-    fail_path = cwd + "/tests/testdata/nfc.txt"
+class IranlowoPreprocessingTest(unittest.TestCase):
 
-    assert preprocessing.is_valid_owé_format(fail_path) is False
+    def test_is_valid_owe_format(self):
+        fail_path = datapath('nfc.txt')
+        self.assertFalse(preprocessing.is_valid_owé_format(fail_path))
diff --git a/tests/testdata/dirdata/yo_000.txt b/tests/testdata/dirdata/yo_000.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/testdata/dirdata/yo_000.txt
@@ -0,0 +1 @@
+
diff --git a/tests/testdata/dirdata/yo_001.txt b/tests/testdata/dirdata/yo_001.txt
new file mode 100644
index 0000000..ab97739
--- /dev/null
+++ b/tests/testdata/dirdata/yo_001.txt
@@ -0,0 +1 @@
+A di gàárì sílẹ̀ ewúrẹ́ ńyọjú; ẹrù ìran rẹ̀ ni?
diff --git a/tests/testdata/dirdata/yo_002.txt b/tests/testdata/dirdata/yo_002.txt
new file mode 100644
index 0000000..87d7002
--- /dev/null
+++ b/tests/testdata/dirdata/yo_002.txt
@@ -0,0 +1 @@
+A fi ọ́ jọba ò ńṣàwúre o fẹ́ jẹ Ọlọ́run ni?
diff --git a/tests/testdata/dirdata/yo_003.txt b/tests/testdata/dirdata/yo_003.txt
new file mode 100644
index 0000000..822cd91
--- /dev/null
+++ b/tests/testdata/dirdata/yo_003.txt
@@ -0,0 +1 @@
+A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, bí a ò bá jà, bí a bá ti gba Awà, kò tán bí?
diff --git a/tests/testdata/dirdata/yo_004.txt b/tests/testdata/dirdata/yo_004.txt
new file mode 100644
index 0000000..c08f5e6
--- /dev/null
+++ b/tests/testdata/dirdata/yo_004.txt
@@ -0,0 +1 @@
+A gbé gàárì ọmọ ewurẹ ńrojú; kì í ṣe ẹrù àgùntàn.
diff --git a/tests/testdata/dirdata/yo_005.txt b/tests/testdata/dirdata/yo_005.txt
new file mode 100644
index 0000000..19a221e
--- /dev/null
+++ b/tests/testdata/dirdata/yo_005.txt
@@ -0,0 +1 @@
+A kì í bá ọba pàlà kí ọkọ́ ọba má ṣàn-ánni lẹ́sẹ̀.
diff --git a/tests/testdata/dirdata/yo_006.txt b/tests/testdata/dirdata/yo_006.txt
new file mode 100644
index 0000000..35113f5
--- /dev/null
+++ b/tests/testdata/dirdata/yo_006.txt
@@ -0,0 +1 @@
+A kì í bínú ààtàn ká dalẹ̀ sígbẹ̀ẹ́.
diff --git a/tests/testdata/dirdata/yo_007.txt b/tests/testdata/dirdata/yo_007.txt
new file mode 100644
index 0000000..695bae3
--- /dev/null
+++ b/tests/testdata/dirdata/yo_007.txt
@@ -0,0 +1 @@
+A kì í bínú orí ká fi fìlà dé ìbàdí.
diff --git a/tests/testdata/dirdata/yo_008.txt b/tests/testdata/dirdata/yo_008.txt
new file mode 100644
index 0000000..5123008
--- /dev/null
+++ b/tests/testdata/dirdata/yo_008.txt
@@ -0,0 +1 @@
+A kì í bẹ̀rù ikú bẹ̀rù àrùn ká ní kí ọmọ ó kú sinni.
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..8f8ed14
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,9 @@
+import os
+
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
+
+
+def datapath(fname=None):
+    if not fname:
+        return os.path.join(module_path, 'testdata')
+    return os.path.join(module_path, 'testdata', fname)
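The loaders resolve corpora relative to the YORUBA_TEXT_PATH environment variable, as exercised by test_loaders.py above. A minimal end-to-end sketch, assuming a local clone of the yoruba-text repository at a hypothetical location:

    import os

    os.environ["YORUBA_TEXT_PATH"] = "/data/yoruba-text"  # hypothetical clone path

    from iranlowo.corpus import OweLoader

    owe = OweLoader()   # wraps Owe/en and Owe/yo as DirectoryCorpus objects
    print(owe.en.path)  # list of files discovered under Owe/en
    print(owe.yo.path)  # list of files discovered under Owe/yo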