Skip to content

Commit

Permalink
allow [[REDACTED*]] values to export
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Perfilov committed Mar 24, 2015
1 parent 1ce1813 commit 3658c5d
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 51 deletions.
19 changes: 13 additions & 6 deletions ckanext/datajson/build_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def extension_to_mime_type(file_ext):


class JsonExportBuilder:

def __init__(self):
global currentPackageOrg
currentPackageOrg = None
Expand Down Expand Up @@ -357,6 +356,8 @@ def generate_distribution(package):
if 'url' in rkeys:
res_url = JsonExportBuilder.strip_if_string(r.get('url'))
if res_url:
res_url = res_url.replace('http://[[REDACTED', '[[REDACTED')
res_url = res_url.replace('http://http', 'http')
if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'):
resource += [("accessURL", res_url)]
else:
Expand Down Expand Up @@ -417,18 +418,24 @@ def get_contact_point(extras):
if required_field not in extras.keys():
raise KeyError(required_field)

email = JsonExportBuilder.strip_if_string(extras['contact_email'])
if email is None or '@' not in email:
raise KeyError('contact_email')

fn = JsonExportBuilder.strip_if_string(extras['contact_name'])
if fn is None:
raise KeyError('contact_name')

email = JsonExportBuilder.strip_if_string(extras['contact_email'])
if email is None:
raise KeyError('contact_email')

if '[[REDACTED' not in email:
if '@' not in email:
raise KeyError('contact_email')
else:
email = 'mailto:' + email

contact_point = OrderedDict([
('@type', 'vcard:Contact'), # optional
('fn', fn), # required
('hasEmail', 'mailto:' + email), # required
('hasEmail', email), # required
])
return contact_point

Expand Down
117 changes: 72 additions & 45 deletions ckanext/datajson/datajsonvalidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,19 @@
r'(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$'
)

# Matches a value that has been fully redacted for public export,
# e.g. "[[REDACTED-EX B5]]": the string must start with "[[REDACTED"
# and end with "]]" (anchored at both ends, so partial matches don't count).
REDACTED_REGEX = re.compile(
    r'^(\[\[REDACTED).*?(\]\])$'
)

# load the OMB bureau codes on first load of this module
import urllib, csv
import urllib
import csv

# Set of valid "AgencyCode:BureauCode" strings, built once from the OMB
# bureau-codes CSV. NOTE(review): this performs network I/O at module import
# time, so importing this module fails if the URL is unreachable — consider
# lazy loading. The name's "burueau" typo is kept as-is because other modules
# may reference it; verify before renaming.
omb_burueau_codes = set()
for row in csv.DictReader(urllib.urlopen("https://project-open-data.cio.gov/data/omb_bureau_codes.csv")):
    omb_burueau_codes.add(row["Agency Code"] + ":" + row["Bureau Code"])


# main function for validation
def do_validation(doc, errors_array):
errs = {}
Expand Down Expand Up @@ -136,14 +142,15 @@ def do_validation(doc, errors_array):

# contactPoint - hasEmail # required
if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
import lepl.apps.rfc3696
if not is_redacted(cp.get('hasEmail')):
import lepl.apps.rfc3696

email_validator = lepl.apps.rfc3696.Email()
email = cp["hasEmail"].replace('mailto:', '')
if not email_validator(email):
add_error(errs, 5, "Invalid Required Field Value",
"The email address \"%s\" is not a valid email address." % email,
dataset_name)
email_validator = lepl.apps.rfc3696.Email()
email = cp["hasEmail"].replace('mailto:', '')
if not email_validator(email):
add_error(errs, 5, "Invalid Required Field Value",
"The email address \"%s\" is not a valid email address." % email,
dataset_name)

# description # required
check_required_string_field(item, "description", 1, dataset_name, errs)
Expand All @@ -158,8 +165,9 @@ def do_validation(doc, errors_array):

# keyword # required
if isinstance(item.get("keyword"), (str, unicode)):
add_error(errs, 5, "Update Your File!",
"The keyword field used to be a string but now it must be an array.", dataset_name)
if not is_redacted(item.get("keyword")):
add_error(errs, 5, "Update Your File!",
"The keyword field used to be a string but now it must be an array.", dataset_name)
elif check_required_field(item, "keyword", list, dataset_name, errs):
for kw in item["keyword"]:
if not isinstance(kw, (str, unicode)):
Expand All @@ -171,7 +179,8 @@ def do_validation(doc, errors_array):

# modified # required
if check_required_string_field(item, "modified", 1, dataset_name, errs):
if not MODIFIED_REGEX_1.match(item['modified']) \
if not is_redacted(item['modified']) \
and not MODIFIED_REGEX_1.match(item['modified']) \
and not MODIFIED_REGEX_2.match(item['modified']) \
and not MODIFIED_REGEX_3.match(item['modified']):
add_error(errs, 5, "Invalid Required Field Value",
Expand All @@ -195,8 +204,8 @@ def do_validation(doc, errors_array):
# Required-If-Applicable

# dataQuality # Required-If-Applicable
if item.get("dataQuality") is None:
pass # not required
if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
pass # not required or REDACTED
elif not isinstance(item["dataQuality"], bool):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'dataQuality' must be true or false, "
Expand All @@ -207,35 +216,42 @@ def do_validation(doc, errors_array):
if item.get("distribution") is None:
pass # not required
elif not isinstance(item["distribution"], list):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'distribution' must be an array, if present.", dataset_name)
if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
pass
else:
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'distribution' must be an array, if present.", dataset_name)
else:
for j, dt in enumerate(item["distribution"]):
if isinstance(dt, (str, unicode)):
if is_redacted(dt):
continue
distribution_name = dataset_name + (" distribution %d" % (j + 1))
# distribution - downloadURL # Required-If-Applicable
check_url_field(False, dt, "downloadURL", distribution_name, errs)
check_url_field(False, dt, "downloadURL", distribution_name, errs, True)

# distribution - mediaType # Required-If-Applicable
if 'downloadURL' in dt:
if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
if not IANA_MIME_REGEX.match(dt["mediaType"]):
if not IANA_MIME_REGEX.match(dt["mediaType"]) \
and not is_redacted(dt["mediaType"]):
add_error(errs, 5, "Invalid Field Value",
"The distribution mediaType \"%s\" is invalid. "
"It must be in IANA MIME format." % dt["mediaType"],
distribution_name)

# distribution - accessURL # optional
check_url_field(False, dt, "accessURL", distribution_name, errs)
check_url_field(False, dt, "accessURL", distribution_name, errs, True)

# distribution - conformsTo # optional
check_url_field(False, dt, "conformsTo", distribution_name, errs)
check_url_field(False, dt, "conformsTo", distribution_name, errs, True)

# distribution - describedBy # optional
check_url_field(False, dt, "describedBy", distribution_name, errs)
check_url_field(False, dt, "describedBy", distribution_name, errs, True)

# distribution - describedByType # optional
if dt.get("describedByType") is None:
pass # not required
if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
pass # not required or REDACTED
elif not IANA_MIME_REGEX.match(dt["describedByType"]):
add_error(errs, 5, "Invalid Field Value",
"The describedByType \"%s\" is invalid. "
Expand All @@ -255,7 +271,7 @@ def do_validation(doc, errors_array):
check_required_string_field(dt, "title", 1, distribution_name, errs)

# license # Required-If-Applicable
check_url_field(False, item, "license", dataset_name, errs)
check_url_field(False, item, "license", dataset_name, errs, True)

# rights # Required-If-Applicable
# TODO move to warnings
Expand All @@ -269,8 +285,8 @@ def do_validation(doc, errors_array):
"The field 'spatial' must be a string value if specified.", dataset_name)

# temporal # Required-If-Applicable
if item.get("temporal") is None:
pass # not required
if item.get("temporal") is None or is_redacted(item.get("temporal")):
pass # not required or REDACTED
elif not isinstance(item["temporal"], (str, unicode)):
add_error(errs, 10, "Invalid Field Value (Optional Fields)",
"The field 'temporal' must be a string value if specified.", dataset_name)
Expand All @@ -286,19 +302,20 @@ def do_validation(doc, errors_array):
# Expanded Fields

# accrualPeriodicity # optional
if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES:
if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \
and not is_redacted(item.get("accrualPeriodicity")):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'accrualPeriodicity' had an invalid value.", dataset_name)

# conformsTo # optional
check_url_field(False, item, "conformsTo", dataset_name, errs)
check_url_field(False, item, "conformsTo", dataset_name, errs, True)

# describedBy # optional
check_url_field(False, item, "describedBy", dataset_name, errs)
check_url_field(False, item, "describedBy", dataset_name, errs, True)

# describedByType # optional
if item.get("describedByType") is None:
pass # not required
if item.get("describedByType") is None or is_redacted(item.get("describedByType")):
pass # not required or REDACTED
elif not IANA_MIME_REGEX.match(item["describedByType"]):
add_error(errs, 5, "Invalid Field Value",
"The describedByType \"%s\" is invalid. "
Expand All @@ -310,52 +327,55 @@ def do_validation(doc, errors_array):
check_required_string_field(item, "isPartOf", 1, dataset_name, errs)

# issued # optional
if item.get("issued") is not None:
if item.get("issued") is not None and not is_redacted(item.get("issued")):
if not ISSUED_REGEX.match(item['issued']):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'issued' is not in a valid format.", dataset_name)

# landingPage # optional
check_url_field(False, item, "landingPage", dataset_name, errs)
check_url_field(False, item, "landingPage", dataset_name, errs, True)

# language # optional
if item.get("language") is None:
pass # not required
if item.get("language") is None or is_redacted(item.get("language")):
pass # not required or REDACTED
elif not isinstance(item["language"], list):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'language' must be an array, if present.", dataset_name)
else:
for s in item["language"]:
if not LANGUAGE_REGEX.match(s):
if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'language' had an invalid language: \"%s\"" % s, dataset_name)

# PrimaryITInvestmentUII # optional
if item.get("PrimaryITInvestmentUII") is None:
pass # not required
if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")):
pass # not required or REDACTED
elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'PrimaryITInvestmentUII' must be a string "
"in 023-000000001 format, if present.", dataset_name)

# references # optional
if item.get("references") is None:
pass # not required
pass # not required or REDACTED
elif not isinstance(item["references"], list):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'references' must be an array, if present.", dataset_name)
if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")):
pass
else:
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'references' must be an array, if present.", dataset_name)
else:
for s in item["references"]:
if not URL_REGEX.match(s):
if not URL_REGEX.match(s) and not is_redacted(s):
add_error(errs, 50, "Invalid Field Value (Optional Fields)",
"The field 'references' had an invalid URL: \"%s\"" % s, dataset_name)

# systemOfRecords # optional
check_url_field(False, item, "systemOfRecords", dataset_name, errs)

# theme #optional
if item.get("theme") is None:
pass # not required
if item.get("theme") is None or is_redacted(item.get("theme")):
pass # not required or REDACTED
elif not isinstance(item["theme"], list):
add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'theme' must be an array.",
dataset_name)
Expand All @@ -374,7 +394,7 @@ def do_validation(doc, errors_array):
err_type[1], # heading
[err_item + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "")
for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x))
]))
]))


def add_error(errs, severity, heading, description, context=None):
Expand Down Expand Up @@ -426,11 +446,18 @@ def check_required_string_field(obj, field_name, min_length, dataset_name, errs)
return True


def check_url_field(required, obj, field_name, dataset_name, errs):
def is_redacted(field):
    """Return True if *field* is a string wholly matching the
    ``[[REDACTED...]]`` pattern (see REDACTED_REGEX), else False.

    Non-string values (None, lists, dicts, ...) are never redacted.
    """
    is_string = isinstance(field, (str, unicode))
    return bool(is_string and REDACTED_REGEX.match(field))


def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False):
# checks that a required or optional field, if specified, looks like a URL
if not required and (field_name not in obj or obj[field_name] is None): return True # not required, so OK
if not check_required_field(obj, field_name, (str, unicode), dataset_name,
errs): return False # just checking data type
if allow_redacted and is_redacted(obj[field_name]): return True
if not URL_REGEX.match(obj[field_name]):
add_error(errs, 5, "Invalid Required Field Value",
"The '%s' field has an invalid URL: \"%s\"." % (field_name, obj[field_name]), dataset_name)
Expand Down

0 comments on commit 3658c5d

Please sign in to comment.