
Commit

Merge pull request #16 from GSA/develop
Release 1.27
Yatin Khadilkar committed Mar 24, 2015
2 parents dc2b004 + 3658c5d commit aa3ea0f
Showing 20 changed files with 3,548 additions and 997 deletions.
1 change: 1 addition & 0 deletions ckanext/datajson/__init__.py
@@ -6,6 +6,7 @@
 import pkgutil
 __path__ = pkgutil.extend_path(__path__, __name__)
 
+from plugin import JsonExportPlugin
 from plugin import DataJsonPlugin
 from harvester_datajson import DataJsonHarvester
 from harvester_cmsdatanavigator import CmsDataNavigatorHarvester
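
With JsonExportPlugin now imported at the package root, CKAN can discover it alongside the existing plugins. As a usage sketch (the plugin names below are illustrative assumptions; the entry points actually registered live in this extension's setup.py, which is not part of this diff), a site enables the plugins in its CKAN .ini configuration:

    # Illustrative names only -- consult setup.py for the real entry points.
    ckan.plugins = datajson datajson_harvest harvest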
853 changes: 496 additions & 357 deletions ckanext/datajson/build_datajson.py

Large diffs are not rendered by default.
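
Since this 853-line diff is not rendered, here is a minimal sketch of the kind of transformation build_datajson.py performs: mapping a CKAN package dict to a Project Open Data data.json record. The function name and field choices below are illustrative assumptions, not the file's actual code:

    from collections import OrderedDict

    def make_datajson_entry(package):
        # Illustrative sketch: copy CKAN core fields and extras into POD metadata.
        extras = dict((e["key"], e["value"]) for e in package.get("extras", []))
        return OrderedDict([
            ("title", package.get("title")),
            ("description", package.get("notes")),
            ("keyword", [t["name"] for t in package.get("tags", [])]),
            ("modified", extras.get("modified", package.get("metadata_modified"))),
            ("publisher", extras.get("publisher")),
            ("accessLevel", extras.get("accessLevel", "public")),
            ("identifier", package.get("id")),
        ])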

21 changes: 9 additions & 12 deletions ckanext/datajson/build_datajsonld.py
@@ -33,14 +33,18 @@ def distribution_to_jsonld(distribution):
     "description": "dcterms:description",
     "keyword": "dcat:keyword",
     "modified": "dcterms:modified",
-    "publisher": "dcat:publisher",
-    "person": "foaf:Person",
+    "publisher": "dcterms:publisher",
+    "contactPoint": "dcat:contactPoint",
     "mbox": "foaf:mbox",
     "identifier": "dcterms:identifier",
     "accessLevel": "pod:accessLevel",
+
+    "bureauCode": "pod:bureauCode",
+    "programCode": "pod:programCode",
+    "accessLevelComment": "pod:accessLevelComment",
     "dataDictionary": "dcat:dataDictionary",
     "accessURL": "dcat:accessURL",
-    "webService": "dcat:webService",
+    "webService": "pod:webService",
     "format": "dcterms:format", # must be a dcterms:MediaTypeOrExtent
     "license": "dcterms:license",
     "spatial": "dcterms:spatial", # must be a dcterms:Location entity
@@ -49,19 +53,16 @@ def distribution_to_jsonld(distribution):
     "issued": "dcterms:issued",
     "accrualPeriodicity": "dcterms:accrualPeriodicity", # must be a dcterms:Frequency
     "language": "dcat:language", # must be an IRI
-    "granularity": "dcat:granularity",
-    "dataQuality": "xsd:boolean",
+    "dataQuality": "pod:dataQuality",
     "theme": "dcat:theme",
     "references": "dcterms:references",
-    "size": "dcat:size",
     "landingPage": "dcat:landingPage",
-    "feed": "dcat:feed",
+    "systemOfRecords": "pod:systemOfRecords",
 }
 
 jsonld_metadata_datatypes = {
     "modified": "http://www.w3.org/2001/XMLSchema#dateTime",
     "issued": "http://www.w3.org/2001/XMLSchema#dateTime",
-    "size": "http://www.w3.org/2001/XMLSchema#decimal",
 }
 
 def apply_jsonld_metadata_mapping(data, newdict):
@@ -72,10 +73,6 @@ def apply_jsonld_metadata_mapping(data, newdict):
         # skip fields with no mapping to RDF
         if k not in jsonld_metadata_mapping: continue
 
-        # specially handle 'keyword' which in JSON is packed in a comma-separated field
-        if k == "keyword":
-            v = v.split(",")
-
         # specially handle literal fields with datatypes
         if k in jsonld_metadata_datatypes:
             # Convert ISO datetime format to xsd:dateTime format.
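
To see the updated mapping in action, here is a small worked example of the key rewriting that apply_jsonld_metadata_mapping performs for ordinary fields (the input dict is sample data; the function's datatype handling is omitted here for brevity):

    data = {
        "publisher": "Widget Services",
        "bureauCode": ["015:11"],
        "webService": "http://example.gov/api",
    }
    newdict = {}
    for k, v in data.items():
        # skip fields with no mapping to RDF
        if k not in jsonld_metadata_mapping:
            continue
        newdict[jsonld_metadata_mapping[k]] = v
    # newdict == {"dcterms:publisher": "Widget Services",
    #             "pod:bureauCode": ["015:11"],
    #             "pod:webService": "http://example.gov/api"}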
119 changes: 73 additions & 46 deletions ckanext/datajson/datajsonvalidator.py
@@ -76,13 +76,19 @@
     r'(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$'
 )
 
+REDACTED_REGEX = re.compile(
+    r'^(\[\[REDACTED).*?(\]\])$'
+)
+
 # load the OMB bureau codes on first load of this module
-import urllib, csv
+import urllib
+import csv
 
 omb_burueau_codes = set()
 for row in csv.DictReader(urllib.urlopen("https://project-open-data.cio.gov/data/omb_bureau_codes.csv")):
     omb_burueau_codes.add(row["Agency Code"] + ":" + row["Bureau Code"])
 
+
 # main function for validation
 def do_validation(doc, errors_array):
     errs = {}
@@ -136,14 +142,15 @@ def do_validation(doc, errors_array):
 
         # contactPoint - hasEmail # required
         if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
-            import lepl.apps.rfc3696
+            if not is_redacted(cp.get('hasEmail')):
+                import lepl.apps.rfc3696
 
-            email_validator = lepl.apps.rfc3696.Email()
-            email = cp["hasEmail"].replace('mailto:', '')
-            if not email_validator(email):
-                add_error(errs, 5, "Invalid Required Field Value",
-                          "The email address \"%s\" is not a valid email address." % email,
-                          dataset_name)
+                email_validator = lepl.apps.rfc3696.Email()
+                email = cp["hasEmail"].replace('mailto:', '')
+                if not email_validator(email):
+                    add_error(errs, 5, "Invalid Required Field Value",
+                              "The email address \"%s\" is not a valid email address." % email,
+                              dataset_name)
 
         # description # required
         check_required_string_field(item, "description", 1, dataset_name, errs)
@@ -158,8 +165,9 @@ def do_validation(doc, errors_array):
 
         # keyword # required
         if isinstance(item.get("keyword"), (str, unicode)):
-            add_error(errs, 5, "Update Your File!",
-                      "The keyword field used to be a string but now it must be an array.", dataset_name)
+            if not is_redacted(item.get("keyword")):
+                add_error(errs, 5, "Update Your File!",
+                          "The keyword field used to be a string but now it must be an array.", dataset_name)
         elif check_required_field(item, "keyword", list, dataset_name, errs):
             for kw in item["keyword"]:
                 if not isinstance(kw, (str, unicode)):
@@ -171,7 +179,8 @@ def do_validation(doc, errors_array):
 
         # modified # required
         if check_required_string_field(item, "modified", 1, dataset_name, errs):
-            if not MODIFIED_REGEX_1.match(item['modified']) \
+            if not is_redacted(item['modified']) \
+                    and not MODIFIED_REGEX_1.match(item['modified']) \
                     and not MODIFIED_REGEX_2.match(item['modified']) \
                     and not MODIFIED_REGEX_3.match(item['modified']):
                 add_error(errs, 5, "Invalid Required Field Value",
@@ -195,8 +204,8 @@ def do_validation(doc, errors_array):
         # Required-If-Applicable
 
         # dataQuality # Required-If-Applicable
-        if item.get("dataQuality") is None:
-            pass # not required
+        if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
+            pass # not required or REDACTED
         elif not isinstance(item["dataQuality"], bool):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                       "The field 'dataQuality' must be true or false, "
@@ -207,35 +216,42 @@ def do_validation(doc, errors_array):
         if item.get("distribution") is None:
             pass # not required
         elif not isinstance(item["distribution"], list):
-            add_error(errs, 50, "Invalid Field Value (Optional Fields)",
-                      "The field 'distribution' must be an array, if present.", dataset_name)
+            if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
+                pass
+            else:
+                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
+                          "The field 'distribution' must be an array, if present.", dataset_name)
         else:
             for j, dt in enumerate(item["distribution"]):
+                if isinstance(dt, (str, unicode)):
+                    if is_redacted(dt):
+                        continue
                 distribution_name = dataset_name + (" distribution %d" % (j + 1))
                 # distribution - downloadURL # Required-If-Applicable
-                check_url_field(False, dt, "downloadURL", distribution_name, errs)
+                check_url_field(False, dt, "downloadURL", distribution_name, errs, True)
 
                 # distribution - mediaType # Required-If-Applicable
                 if 'downloadURL' in dt:
                     if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
-                        if not IANA_MIME_REGEX.match(dt["mediaType"]):
+                        if not IANA_MIME_REGEX.match(dt["mediaType"]) \
+                                and not is_redacted(dt["mediaType"]):
                             add_error(errs, 5, "Invalid Field Value",
                                       "The distribution mediaType \"%s\" is invalid. "
                                       "It must be in IANA MIME format." % dt["mediaType"],
                                       distribution_name)
 
                 # distribution - accessURL # optional
-                check_url_field(False, dt, "accessURL", distribution_name, errs)
+                check_url_field(False, dt, "accessURL", distribution_name, errs, True)
 
                 # distribution - conformsTo # optional
-                check_url_field(False, dt, "conformsTo", distribution_name, errs)
+                check_url_field(False, dt, "conformsTo", distribution_name, errs, True)
 
                 # distribution - describedBy # optional
-                check_url_field(False, dt, "describedBy", distribution_name, errs)
+                check_url_field(False, dt, "describedBy", distribution_name, errs, True)
 
                 # distribution - describedByType # optional
-                if dt.get("describedByType") is None:
-                    pass # not required
+                if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
+                    pass # not required or REDACTED
                 elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                     add_error(errs, 5, "Invalid Field Value",
                               "The describedByType \"%s\" is invalid. "
@@ -255,7 +271,7 @@ def do_validation(doc, errors_array):
                 check_required_string_field(dt, "title", 1, distribution_name, errs)
 
         # license # Required-If-Applicable
-        check_url_field(False, item, "license", dataset_name, errs)
+        check_url_field(False, item, "license", dataset_name, errs, True)
 
         # rights # Required-If-Applicable
         # TODO move to warnings
@@ -269,8 +285,8 @@ def do_validation(doc, errors_array):
                       "The field 'spatial' must be a string value if specified.", dataset_name)
 
         # temporal # Required-If-Applicable
-        if item.get("temporal") is None:
-            pass # not required
+        if item.get("temporal") is None or is_redacted(item.get("temporal")):
+            pass # not required or REDACTED
         elif not isinstance(item["temporal"], (str, unicode)):
             add_error(errs, 10, "Invalid Field Value (Optional Fields)",
                       "The field 'temporal' must be a string value if specified.", dataset_name)
@@ -286,19 +302,20 @@ def do_validation(doc, errors_array):
         # Expanded Fields
 
         # accrualPeriodicity # optional
-        if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES:
+        if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \
+                and not is_redacted(item.get("accrualPeriodicity")):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                       "The field 'accrualPeriodicity' had an invalid value.", dataset_name)
 
         # conformsTo # optional
-        check_url_field(False, item, "conformsTo", dataset_name, errs)
+        check_url_field(False, item, "conformsTo", dataset_name, errs, True)
 
         # describedBy # optional
-        check_url_field(False, item, "describedBy", dataset_name, errs)
+        check_url_field(False, item, "describedBy", dataset_name, errs, True)
 
         # describedByType # optional
-        if item.get("describedByType") is None:
-            pass # not required
+        if item.get("describedByType") is None or is_redacted(item.get("describedByType")):
+            pass # not required or REDACTED
         elif not IANA_MIME_REGEX.match(item["describedByType"]):
             add_error(errs, 5, "Invalid Field Value",
                       "The describedByType \"%s\" is invalid. "
@@ -310,52 +327,55 @@ def do_validation(doc, errors_array):
         check_required_string_field(item, "isPartOf", 1, dataset_name, errs)
 
         # issued # optional
-        if item.get("issued") is not None:
+        if item.get("issued") is not None and not is_redacted(item.get("issued")):
             if not ISSUED_REGEX.match(item['issued']):
                 add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                           "The field 'issued' is not in a valid format.", dataset_name)
 
         # landingPage # optional
-        check_url_field(False, item, "landingPage", dataset_name, errs)
+        check_url_field(False, item, "landingPage", dataset_name, errs, True)
 
         # language # optional
-        if item.get("language") is None:
-            pass # not required
+        if item.get("language") is None or is_redacted(item.get("language")):
+            pass # not required or REDACTED
         elif not isinstance(item["language"], list):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                       "The field 'language' must be an array, if present.", dataset_name)
         else:
             for s in item["language"]:
-                if not LANGUAGE_REGEX.match(s):
+                if not LANGUAGE_REGEX.match(s) and not is_redacted(s):
                     add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                               "The field 'language' had an invalid language: \"%s\"" % s, dataset_name)
 
         # PrimaryITInvestmentUII # optional
-        if item.get("PrimaryITInvestmentUII") is None:
-            pass # not required
+        if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")):
+            pass # not required or REDACTED
         elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                       "The field 'PrimaryITInvestmentUII' must be a string "
                       "in 023-000000001 format, if present.", dataset_name)
 
         # references # optional
         if item.get("references") is None:
-            pass # not required
+            pass # not required or REDACTED
         elif not isinstance(item["references"], list):
-            add_error(errs, 50, "Invalid Field Value (Optional Fields)",
-                      "The field 'references' must be an array, if present.", dataset_name)
+            if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")):
+                pass
+            else:
+                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
+                          "The field 'references' must be an array, if present.", dataset_name)
         else:
             for s in item["references"]:
-                if not URL_REGEX.match(s):
+                if not URL_REGEX.match(s) and not is_redacted(s):
                     add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                               "The field 'references' had an invalid URL: \"%s\"" % s, dataset_name)
 
         # systemOfRecords # optional
         check_url_field(False, item, "systemOfRecords", dataset_name, errs)
 
         # theme #optional
-        if item.get("theme") is None:
-            pass # not required
+        if item.get("theme") is None or is_redacted(item.get("theme")):
+            pass # not required or REDACTED
         elif not isinstance(item["theme"], list):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'theme' must be an array.",
                       dataset_name)
@@ -374,7 +394,7 @@ def do_validation(doc, errors_array):
             err_type[1], # heading
             [err_item + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "")
              for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x))
-            ]))
+             ]))
 
 
 def add_error(errs, severity, heading, description, context=None):
@@ -397,7 +417,7 @@ def check_required_field(obj, field_name, data_type, dataset_name, errs):
         add_error(errs, 10, "Missing Required Fields", "The '%s' field is missing." % field_name, dataset_name)
         return False
     elif obj[field_name] is None:
-        add_error(errs, 10, "Missing Required Fields", "The '%s' field is set to null." % field_name, dataset_name)
+        add_error(errs, 10, "Missing Required Fields", "The '%s' field is empty." % field_name, dataset_name)
         return False
     elif not isinstance(obj[field_name], data_type):
         add_error(errs, 5, "Invalid Required Field Value",
@@ -426,11 +446,18 @@ def check_required_string_field(obj, field_name, min_length, dataset_name, errs):
     return True
 
 
-def check_url_field(required, obj, field_name, dataset_name, errs):
+def is_redacted(field):
+    if isinstance(field, (str, unicode)) and REDACTED_REGEX.match(field):
+        return True
+    return False
+
+
+def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False):
     # checks that a required or optional field, if specified, looks like a URL
     if not required and (field_name not in obj or obj[field_name] is None): return True # not required, so OK
     if not check_required_field(obj, field_name, (str, unicode), dataset_name,
                                 errs): return False # just checking data type
+    if allow_redacted and is_redacted(obj[field_name]): return True
     if not URL_REGEX.match(obj[field_name]):
         add_error(errs, 5, "Invalid Required Field Value",
                   "The '%s' field has an invalid URL: \"%s\"." % (field_name, obj[field_name]), dataset_name)
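
Taken together, the redaction changes work like this: REDACTED_REGEX accepts any string of the form [[REDACTED...]], is_redacted wraps that match in a type check, and the validators either skip such values outright or, for URL fields, accept them only when the new allow_redacted argument is passed. A short demonstration with example values (Python 2, matching the str/unicode checks above):

    import re

    REDACTED_REGEX = re.compile(r'^(\[\[REDACTED).*?(\]\])$')

    def is_redacted(field):
        if isinstance(field, (str, unicode)) and REDACTED_REGEX.match(field):
            return True
        return False

    print is_redacted("[[REDACTED]]")             # True
    print is_redacted("[[REDACTED-ex B5]]")       # True: any text may sit between REDACTED and ]]
    print is_redacted("http://example.gov/data")  # False: validated as usual
    # Consequence: check_url_field(False, item, "license", dataset_name, errs, True)
    # returns True when item["license"] == "[[REDACTED]]" instead of reporting
    # an invalid URL; without allow_redacted=True it would still be an error.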
