Corpus module #13

Merged · 6 commits · Aug 1, 2019
2 changes: 2 additions & 0 deletions requirements.txt
@@ -4,3 +4,5 @@ torch
numpy
requests
tqdm
google-compute-engine
gensim
271 changes: 0 additions & 271 deletions src/iranlowo/adr.py
@@ -4,272 +4,11 @@
from __future__ import unicode_literals

import pkg_resources
import re
import unicodedata

from argparse import Namespace
from collections import defaultdict
from onmt.translate.translator import build_translator
from onmt.utils.parse import ArgumentParser


def strip_accents_text(text_string):
"""
Converts the string to NFD, separates & returns only the base characters
    :param text_string: input text, possibly containing diacritics
:return: input string without diacritic adornments on base characters
"""
return "".join(
c
for c in unicodedata.normalize("NFD", text_string)
if unicodedata.category(c) != "Mn"
)
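
# Example (illustrative): strip_accents_text("Kílódé") returns "Kilode";
# NFD splits í, ó and é into base letters plus combining accents, which
# carry the Unicode category "Mn" and are therefore dropped.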


def strip_accents_file(filename, outfilename):
    """
    Reads filename containing diacritics, converts to NFC for consistency,
    then writes outfilename with diacritics removed
    :param filename: input file containing diacritized text
    :param outfilename: output file to write the stripped text to
    :return: True on success, False if the output file could not be opened
    """
    with open(filename, encoding="utf-8") as infile:
        text = unicodedata.normalize("NFC", infile.read())
    try:
        f = open(outfilename, "w", encoding="utf-8")
    except EnvironmentError:
        return False
    else:
        with f:
            f.write(strip_accents_text(text))
        return True


def is_file_nfc(path):
    """

    Args:
        path: File path

    Returns: True if the file contents are valid NFC and False if not.
    Raises an OSError if the path cannot be opened.

    """
    with open(path, encoding="utf-8") as f:
        text = f.read()
    return is_text_nfc(text)


def is_text_nfc(text):
    """Validate that the given text is in Unicode NFC form"""
    return unicodedata.normalize("NFC", text) == text
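
# Example (illustrative): is_text_nfc("Kílódé") is True for the composed (NFC)
# string, while is_text_nfc(unicodedata.normalize("NFD", "Kílódé")) is False.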


def normalize_diacritics_text(text_string):
"""Convenience wrapper to abstract away unicode & NFC"""
return unicodedata.normalize("NFC", text_string)
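
# Example (illustrative): given an NFD-decomposed "Kílódé",
# normalize_diacritics_text returns the composed NFC form "Kílódé".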


def normalize_diacritics_file(filename, outfilename):
    """File-based convenience wrapper to abstract away unicode & NFC"""
    try:
        with open(filename, encoding="utf-8") as infile:
            text = unicodedata.normalize("NFC", infile.read())
        with open(outfilename, "w", encoding="utf-8") as f:
            f.write(text)
    except EnvironmentError:
        return False
    else:
        return True


def file_info(filename):
"""File metadata useful for various ADR tasks"""

print("\nFilename: " + filename)
print("---------------------------------")

lines = tuple(open(filename, "r", encoding="utf-8"))
num_utts = len(lines)

text = "".join(
c for c in unicodedata.normalize("NFC", open(filename, encoding="utf-8").read())
)
words = re.findall("\w+", text)
num_words = len(words)
num_chars = len(re.findall(r"\S", text))

unique_chars = set(text)
num_uniq_chars = len(unique_chars)

print(sorted(unique_chars))
print("# utts : " + str(num_utts))
print("# chars : " + str(num_chars))
print("# uniq chars: " + str(num_uniq_chars))

# unaccented word stats
unaccented_words = 0
for word in words:
if word == strip_accents_text(word):
unaccented_words += 1

print("# total words: " + str(num_words))
print("# unaccented words : " + str(unaccented_words))
print("-----------------------------------------------")

# ambiguous word stats
ambiguity_map = defaultdict(set)
for word in words:
no_accents = strip_accents_text(word)
ambiguity_map[no_accents].add(word)
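
    # For example (illustrative): stripping accents maps "oko", "ọkọ" and "ọkọ̀"
    # to the same key "oko", so ambiguity_map["oko"] collects several distinct
    # surface forms.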

    # count, per ambiguity level (the number of distinct diacritized forms
    # that collapse to one stripped form), how many stripped words occur
    ambiguous_words = 0
    ambiguity_level_counts = defaultdict(int)
    for word in ambiguity_map:
        level = len(ambiguity_map[word])
        if level > 1:
            ambiguous_words += 1
            ambiguity_level_counts[level] += 1

    # print the ambiguous word groups, prefixed by their ambiguity level
    for word in ambiguity_map:
        level = len(ambiguity_map[word])
        if level > 1:
            print("# " + str(level) + ": " + str(ambiguity_map[word]))

print("# unique ambiguous words : " + str(ambiguous_words))
print("# total unique non-diacritized words : " + str(len(ambiguity_map)))

    unique_all_words = set(words)

print("# total unique words : " + str(len(unique_all_words)))
print("-----------------------------------------------")
print("# ambiguous 2 words : " + str(ambiguous_words_2))
print("# ambiguous 3 words : " + str(ambiguous_words_3))
print("# ambiguous 4 words : " + str(ambiguous_words_4))
print("# ambiguous 5 words : " + str(ambiguous_words_5))
print("# ambiguous 6 words : " + str(ambiguous_words_6))
print("# ambiguous 7 words : " + str(ambiguous_words_7))
print("# ambiguous 8 words : " + str(ambiguous_words_8))
print("# ambiguous 9 words : " + str(ambiguous_words_9))


def split_corpus_on_symbol(filename, outfilename, symbol=","):
    """
    Splits overly long lines on a given symbol, for the yoruba blog corpus
    (and probably bibeli mimo).

    :param filename: input file
    :param outfilename: processed output file to write
    :param symbol: symbol to split lines on
    :return: None, with the side effect of writing an output file
    """

lines = tuple(open(filename, "r", encoding="utf-8"))

min_words_to_split = 10
min_words_in_utt = 5
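    # Example (illustrative): with these thresholds, a 12-word line containing
    # a single comma after its sixth word is written out as two utterances,
    # since both halves hold at least min_words_in_utt words.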

with open(outfilename, "w") as f:
# split out heavily comma'd text :((
for line in lines:
if symbol in line:
num_words = len(line.split())
num_commas = line.count(symbol)
curr_comma_position = line.index(symbol)
num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())

curr_line = line
while num_commas > 0:
if num_words < min_words_to_split:
# print(curr_line.strip())
f.write(curr_line)
break
if num_words >= min_words_to_split:
if (
num_words_ahead_of_curr_comma >= min_words_in_utt
and len((curr_line)[curr_comma_position:].split())
>= min_words_in_utt
):
f.write((curr_line)[0:curr_comma_position] + "\n")

# update vars
curr_line = curr_line[curr_comma_position + 1:]
num_words = len(curr_line.split())
num_commas = num_commas - 1
if num_commas > 0:
curr_comma_position = curr_line.index(symbol)
num_words_ahead_of_curr_comma = len(
curr_line[0:curr_comma_position].split()
)
else:
f.write(curr_line)
else:
                        # this comma yields a too-short segment; skip it and
                        # advance to the next one (note += rather than = when
                        # updating the comma position below)
num_commas = num_commas - 1
if num_commas > 0: # for say 3 commas
curr_comma_position += (
curr_line[curr_comma_position + 1:].index(symbol)
+ 1
)
num_words_ahead_of_curr_comma = len(
curr_line[0:curr_comma_position].split()
)
else:
f.write(curr_line)
else:
f.write(curr_line)
else:
f.write(line)


def diacritize_text(undiacritized_text, verbose=False):
# manually construct the options so we don't have to pass them in.
opt = Namespace()
@@ -339,13 +78,3 @@ def diacritize_text(undiacritized_text, verbose=False):
)
return prediction[0][0]
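
# Example (illustrative; output depends on the trained model): calling
# diacritize_text("awon okunrin") might return "àwọn ọkùnrin".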


if __name__ == "__main__":
    # test
print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFD
print(is_text_nfc("Kílódé, ṣèbí àdúrà le̩ fé̩ gbà nbẹ?")) # NFC
print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Book_of_Mormon/cleaned/doctrine_and_covenants.txt'))

print(is_file_nfc('/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text/Owe/yoruba_proverbs_out.txt'))

# file_info("../../tests/testdata/nfc.txt")
2 changes: 2 additions & 0 deletions src/iranlowo/corpus/__init__.py
@@ -0,0 +1,2 @@
from .corpus import Corpus, DirectoryCorpus
from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
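
# Hypothetical usage sketch (not part of this diff): assumes the loaders above
# expose an iterable corpus of text lines; the "path" argument and iteration
# behavior are assumptions, not confirmed by this PR.
#
#   from iranlowo.corpus import YorubaBlogCorpus
#   corpus = YorubaBlogCorpus(path="yoruba_blog.txt")
#   for line in corpus:
#       print(line)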