diff --git a/ckanext/datajson/__init__.py b/ckanext/datajson/__init__.py
index f20145e9..b68480c0 100644
--- a/ckanext/datajson/__init__.py
+++ b/ckanext/datajson/__init__.py
@@ -6,6 +6,7 @@
 import pkgutil
 __path__ = pkgutil.extend_path(__path__, __name__)
 
+from plugin import JsonExportPlugin
 from plugin import DataJsonPlugin
 from harvester_datajson import DataJsonHarvester
 from harvester_cmsdatanavigator import CmsDataNavigatorHarvester
diff --git a/ckanext/datajson/build_datajson.py b/ckanext/datajson/build_datajson.py
index de7be113..f8a704cf 100644
--- a/ckanext/datajson/build_datajson.py
+++ b/ckanext/datajson/build_datajson.py
@@ -11,375 +11,93 @@
 log = logging.getLogger('datajson')
 
-# TODO this file is pretty sloppy, needs cleanup and redundancies removed
-
-def make_datajson_catalog(datasets):
-    catalog = OrderedDict([
-        ('conformsTo', 'https://project-open-data.cio.gov/v1.1/schema'),  # requred
-        ('describedBy', 'https://project-open-data.cio.gov/v1.1/schema/catalog.json'),  # optional
-        ('@context', 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld'),  # optional
-        ('@type', 'dcat:Catalog'),  # optional
-        ('dataset', datasets),  # required
-    ])
-    return catalog
-
-
-def make_datajson_entry(package):
-    # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict
-    extras = dict([(x['key'], x['value']) for x in package['extras']])
-
-    parent_dataset_id = extras.get('parent_dataset')
-    if parent_dataset_id:
-        parent = model.Package.get(parent_dataset_id)
-        parent_uid = parent.extras.col.target['unique_id'].value
-        if parent_uid:
-            parent_dataset_id = parent_uid
-
-    # if resource format is CSV then convert it to text/csv
-    # Resource format has to be in 'csv' format for automatic datastore push.
- for r in package["resources"]: - if r["format"].lower() == "csv": - r["format"] = "text/csv" - if r["format"].lower() == "json": - r["format"] = "application/json" - if r["format"].lower() == "pdf": - r["format"] = "application/pdf" - - try: - retlist = [ - ("@type", "dcat:Dataset"), # optional - - ("title", strip_if_string(package["title"])), # required - - # ("accessLevel", 'public'), # required - ("accessLevel", strip_if_string(extras.get('public_access_level'))), # required - - # ("accrualPeriodicity", "R/P1Y"), # optional - # ('accrualPeriodicity', 'accrual_periodicity'), - ('accrualPeriodicity', get_accrual_periodicity(extras.get('accrual_periodicity'))), # optional - - ("conformsTo", strip_if_string(extras.get('conforms_to'))), # optional - - # ('contactPoint', OrderedDict([ - # ("@type", "vcard:Contact"), - # ("fn", "Jane Doe"), - # ("hasEmail", "mailto:jane.doe@agency.gov") - # ])), # required - ('contactPoint', get_contact_point(extras, package)), # required - - ("dataQuality", strip_if_string(extras.get('data_quality'))), # required-if-applicable - - ("describedBy", strip_if_string(extras.get('data_dictionary'))), # optional - ("describedByType", strip_if_string(extras.get('data_dictionary_type'))), # optional - - ("description", strip_if_string(package["notes"])), # required - - # ("description", 'asdfasdf'), # required - - ("identifier", strip_if_string(extras.get('unique_id'))), # required - # ("identifier", 'asdfasdfasdf'), # required - - ("isPartOf", parent_dataset_id), # optional - ("issued", strip_if_string(extras.get('release_date'))), # optional - - # ("keyword", ['a', 'b']), # required - ("keyword", [t["display_name"] for t in package["tags"]]), # required - - ("landingPage", strip_if_string(extras.get('homepage_url'))), # optional - - ("license", strip_if_string(extras.get("license_new"))), # required-if-applicable - - ("modified", strip_if_string(extras.get("modified"))), # required - - ("primaryITInvestmentUII", strip_if_string(extras.get('primary_it_investment_uii'))), # optional - - # ('publisher', OrderedDict([ - # ("@type", "org:Organization"), - # ("name", "Widget Services") - # ])), # required - # ("publisher", get_publisher_tree(extras)), # required - ("publisher", get_publisher_tree_wrong_order(extras)), # required - - ("rights", strip_if_string(extras.get('access_level_comment'))), # required - - ("spatial", strip_if_string(package.get("spatial"))), # required-if-applicable - - ('systemOfRecords', strip_if_string(extras.get('system_of_records'))), # optional - - ("temporal", strip_if_string(extras.get('temporal'))), # required-if-applicable - - ("distribution", generate_distribution(package)), # required-if-applicable - - # ("distribution", - # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. 
format) - # [ - # OrderedDict([ - # ("downloadURL", r["url"]), - # ("mediaType", r["formatReadable"]), - # ]) - # for r in package["resources"] - # ]) - ] - - for pair in [ - ('bureauCode', 'bureau_code'), # required - ('language', 'language'), # optional - ('programCode', 'program_code'), # required - ('references', 'related_documents'), # optional - ('theme', 'category'), # optional - ]: - split_multiple_entries(retlist, extras, pair) - - except KeyError as e: - log.warn("Invalid field detected for package with id=[%s], title=['%s']: '%s'", package.get('id'), - package.get('title'), e) - return - - # # TODO this is a lazy hack to make sure we don't have redundant fields when the free form key/value pairs are added - # extras_to_filter_out = ['publisher', 'contact_name', 'contact_email', 'unique_id', 'public_access_level', - # 'data_dictionary', 'bureau_code', 'program_code', 'access_level_comment', 'license_title', - # 'spatial', 'temporal', 'release_date', 'accrual_periodicity', 'language', 'granularity', - # 'data_quality', 'size', 'homepage_url', 'rss_feed', 'category', 'related_documents', - # 'system_of_records', 'system_of_records_none_related_to_this_dataset', 'tags', - # 'extrasRollup', 'format', 'accessURL', 'notes', 'publisher_1', 'publisher_2', 'publisher_3', - # 'publisher_4', 'publisher_5'] - # - # # Append any free extras (key/value pairs) that aren't part of common core but have been associated with the dataset - # # TODO really hackey, short on time, had to hardcode a lot of the names to remove. there's much better ways, maybe - # # generate a list of keys to ignore by calling a specific function to get the extras - # retlist_keys = [x for x, y in retlist] - # extras_keys = set(extras.keys()) - set(extras_to_filter_out) - # - # for key in extras_keys: - # convertedKey = underscore_to_camelcase(key) - # if convertedKey not in retlist_keys: - # retlist.append((convertedKey, extras[key])) - - # Remove entries where value is None, "", or empty list [] - striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []] - striped_retlist_keys = [x for x, y in striped_retlist] - - - # If a required metadata field was removed, return empty string - # for required_field in ["accessLevel", "bureauCode", "contactPoint", "description", "identifier", "keyword", - # "modified", "programCode", "publisher", "title"]: - # if required_field not in striped_retlist_keys: - # log.warn("Missing required field detected for package with id=[%s], title=['%s']: '%s'", - # package.get('id'), package.get('title'), required_field) - # return - - # When saved from UI DataQuality value is stored as "on" instead of True. - # Check if value is "on" and replace it with True. 
- striped_retlist_dict = OrderedDict(striped_retlist) - if striped_retlist_dict.get('dataQuality') == "on" \ - or striped_retlist_dict.get('dataQuality') == "true" \ - or striped_retlist_dict.get('dataQuality') == "True": - striped_retlist_dict['dataQuality'] = True - elif striped_retlist_dict.get('dataQuality') == "false" \ - or striped_retlist_dict.get('dataQuality') == "False": - striped_retlist_dict['dataQuality'] = False - - from datajsonvalidator import do_validation - - errors = [] - try: - do_validation([dict(striped_retlist_dict)], errors) - except Exception as e: - errors.append(("Internal Error", ["Something bad happened: " + unicode(e)])) - if len(errors) > 0: - for error in errors: - log.warn(error) - return - - return striped_retlist_dict - - -# used by get_accrual_periodicity -accrual_periodicity_dict = { - 'completely irregular': 'irregular', - 'decennial': 'R/P10Y', - 'quadrennial': 'R/P4Y', - 'annual': 'R/P1Y', - 'bimonthly': 'R/P2M', # or R/P0.5M - 'semiweekly': 'R/P3.5D', - 'daily': 'R/P1D', - 'biweekly': 'R/P2W', # or R/P0.5W - 'semiannual': 'R/P6M', - 'biennial': 'R/P2Y', - 'triennial': 'R/P3Y', - 'three times a week': 'R/P0.33W', - 'three times a month': 'R/P0.33M', - 'continuously updated': 'R/PT1S', - 'monthly': 'R/P1M', - 'quarterly': 'R/P3M', - 'semimonthly': 'R/P0.5M', - 'three times a year': 'R/P4M', - 'weekly': 'R/P1W' -} - - -def get_accrual_periodicity(frequency): - return accrual_periodicity_dict.get(str(frequency).lower().strip(), frequency) - - -def generate_distribution(package): - arr = [] - for r in package["resources"]: - resource = [("@type", "dcat:Distribution")] - rkeys = r.keys() - if 'url' in rkeys: - res_url = strip_if_string(r.get('url')) - if res_url: - if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'): - resource += [("accessURL", res_url)] - else: - resource += [("downloadURL", res_url)] - if 'format' in rkeys: - res_format = strip_if_string(r.get('format')) - if res_format: - resource += [("mediaType", res_format)] - else: - log.warn("Missing mediaType for resource in package ['%s']", package.get('id')) - else: - log.warn("Missing downloadURL for resource in package ['%s']", package.get('id')) - - # if 'accessURL_new' in rkeys: - # res_access_url = strip_if_string(r.get('accessURL_new')) - # if res_access_url: - # resource += [("accessURL", res_access_url)] - - if 'formatReadable' in rkeys: - res_attr = strip_if_string(r.get('formatReadable')) - if res_attr: - resource += [("format", res_attr)] - - if 'name' in rkeys: - res_attr = strip_if_string(r.get('name')) - if res_attr: - resource += [("title", res_attr)] - - if 'notes' in rkeys: - res_attr = strip_if_string(r.get('notes')) - if res_attr: - resource += [("description", res_attr)] - if 'conformsTo' in rkeys: - res_attr = strip_if_string(r.get('conformsTo')) - if res_attr: - resource += [("conformsTo", res_attr)] +def get_facet_fields(): + # Return fields that we'd like to add to default CKAN faceting. This really has + # nothing to do with exporting data.json but it's probably a common consideration. 
+ facets = OrderedDict() - if 'describedBy' in rkeys: - res_attr = strip_if_string(r.get('describedBy')) - if res_attr: - resource += [("describedBy", res_attr)] + # using "author" produces weird results because the Solr schema indexes it as "text" rather than "string" + facets["Agency"] = "Publishers" + # search facets remove spaces from field names + facets["SubjectArea1"] = "Subjects" + return facets - if 'describedByType' in rkeys: - res_attr = strip_if_string(r.get('describedByType')) - if res_attr: - resource += [("describedByType", res_attr)] - striped_resource = [(x, y) for x, y in resource if y is not None and y != "" and y != []] - - arr += [OrderedDict(striped_resource)] - - return arr - - -def get_contact_point(extras, package): - for required_field in ["contact_name", "contact_email"]: - if required_field not in extras.keys(): - raise KeyError(required_field) - - email = strip_if_string(extras['contact_email']) - if email is None or '@' not in email: - raise KeyError(required_field) - - fn = strip_if_string(extras['contact_name']) - if fn is None: - raise KeyError(required_field) - - contact_point = OrderedDict([ - ('@type', 'vcard:Contact'), # optional - ('fn', fn), # required - ('hasEmail', 'mailto:' + email), # required +def make_datajson_entry(package): + return OrderedDict([ + ("title", package["title"]), + ("description", package["notes"]), + ("keyword", [t["display_name"] for t in package["tags"]]), + ("modified", extra(package, "Date Updated")), + ("publisher", package["author"]), + ("bureauCode", extra(package, "Bureau Code").split(" ") if extra(package, "Bureau Code") else None), + ("programCode", extra(package, "Program Code").split(" ") if extra(package, "Program Code") else None), + ("contactPoint", extra(package, "Contact Name")), + ("mbox", extra(package, "Contact Email")), + ("identifier", package["id"]), + ("accessLevel", extra(package, "Access Level", default="public")), + ("accessLevelComment", extra(package, "Access Level Comment")), + ("dataDictionary", extra(package, "Data Dictionary")), + ("accessURL", get_primary_resource(package).get("url", None)), + ("webService", get_api_resource(package).get("url", None)), + ("format", extension_to_mime_type(get_primary_resource(package).get("format", None))), + ("license", extra(package, "License Agreement")), + ("spatial", extra(package, "Geographic Scope")), + ("temporal", build_temporal(package)), + ("issued", extra(package, "Date Released")), + ("accrualPeriodicity", extra(package, "Publish Frequency")), + ("language", extra(package, "Language")), + ("PrimaryITInvestmentUII", extra(package, "PrimaryITInvestmentUII")), + ("granularity", "/".join( + x for x in [extra(package, "Unit of Analysis"), extra(package, "Geographic Granularity")] if + x is not None)), + ("dataQuality", extra(package, "Data Quality Met", default="true") == "true"), + ("theme", [s for s in ( + extra(package, "Subject Area 1"), extra(package, "Subject Area 2"), extra(package, "Subject Area 3") + ) if s is not None]), + + ("references", [s for s in [extra(package, "Technical Documentation")] if s is not None]), + ("landingPage", package["url"]), + ("systemOfRecords", extra(package, "System Of Records")), + ("distribution", + [ + OrderedDict([ + ("identifier", r["id"]), # NOT in POD standard, but useful for conversion to JSON-LD + ("accessURL", r["url"]), + ("format", r.get("mimetype", extension_to_mime_type(r["format"]))), + ]) + for r in package["resources"] + if r["format"].lower() not in ("api", "query tool", "widget") + ]), ]) - return 
contact_point def extra(package, key, default=None): # Retrieves the value of an extras field. - for extra in package["extras"]: - if extra["key"] == key: - return extra["value"] + for xtra in package["extras"]: + if xtra["key"] == key: + return xtra["value"] return default -def get_publisher_tree_wrong_order(extras): - publisher = strip_if_string(extras.get('publisher')) - if publisher is None: - raise KeyError('publisher') - - organization_list = list() - organization_list.append([ - ('@type', 'org:Organization'), # optional - ('name', publisher), # required - ]) - - for i in range(1, 6): - key = 'publisher_' + str(i) - if key in extras and extras[key] and strip_if_string(extras[key]): - organization_list.append([ - ('@type', 'org:Organization'), # optional - ('name', strip_if_string(extras[key])), # required - ]) - - size = len(organization_list) - - # [OSCIT, GSA] - # organization_list.reverse() - # [GSA, OSCIT] - - tree = False - for i in range(0, size): - if tree: - organization_list[i] += [('subOrganizationOf', OrderedDict(tree))] - tree = organization_list[i] - - return OrderedDict(tree) - - -def underscore_to_camelcase(value): - """ - Convert underscored strings to camel case, e.g. one_two_three to oneTwoThree - """ - - def camelcase(): - yield unicode.lower - while True: - yield unicode.capitalize - - c = camelcase() - return "".join(c.next()(x) if x else '_' for x in value.split("_")) - - -def get_best_resource(package, acceptable_formats): +def get_best_resource(package, acceptable_formats, unacceptable_formats=None): resources = list(r for r in package["resources"] if r["format"].lower() in acceptable_formats) - if len(resources) == 0: return {} - resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower())) + if len(resources) == 0: + if unacceptable_formats: + # try at least any resource that's not unacceptable + resources = list(r for r in package["resources"] if r["format"].lower() not in unacceptable_formats) + if len(resources) == 0: + # there is no acceptable resource to show + return {} + else: + resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower())) return resources[0] -def strip_if_string(val): - if isinstance(val, (str, unicode)): - val = val.strip() - if '' == val: - val = None - return val - - def get_primary_resource(package): # Return info about a "primary" resource. Select a good one. - return get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf")) + return get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf"), ("api", "query tool", "widget")) def get_api_resource(package): @@ -387,9 +105,430 @@ def get_api_resource(package): return get_best_resource(package, ("api", "query tool")) -def split_multiple_entries(retlist, extras, names): - found_element = string.strip(extras.get(names[1], "")) - if found_element: - retlist.append( - (names[0], [string.strip(x) for x in string.split(found_element, ',')]) - ) +def build_temporal(package): + # Build one dataset entry of the data.json file. 
+ if extra(package, "Coverage Period Fiscal Year Start"): + temporal = "FY" + extra(package, "Coverage Period Fiscal Year Start").replace(" ", "T").replace("T00:00:00", "") + else: + temporal = extra(package, "Coverage Period Start", "Unknown").replace(" ", "T").replace("T00:00:00", "") + temporal += "/" + if extra(package, "Coverage Period Fiscal Year End"): + temporal += "FY" + extra(package, "Coverage Period Fiscal Year End").replace(" ", "T").replace("T00:00:00", "") + else: + temporal += extra(package, "Coverage Period End", "Unknown").replace(" ", "T").replace("T00:00:00", "") + if temporal == "Unknown/Unknown": return None + return temporal + + +def extension_to_mime_type(file_ext): + if file_ext is None: return None + ext = { + "csv": "text/csv", + "xls": "application/vnd.ms-excel", + "xml": "application/xml", + "rdf": "application/rdf+xml", + "json": "application/json", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "text": "text/plain", + "feed": "application/rss+xml", + } + return ext.get(file_ext.lower(), "application/unknown") + + +currentPackageOrg = None + + +class JsonExportBuilder: + def __init__(self): + global currentPackageOrg + currentPackageOrg = None + + @staticmethod + def make_datajson_export_catalog(datasets): + catalog = OrderedDict([ + ('conformsTo', 'https://project-open-data.cio.gov/v1.1/schema'), # requred + ('describedBy', 'https://project-open-data.cio.gov/v1.1/schema/catalog.json'), # optional + ('@context', 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld'), # optional + ('@type', 'dcat:Catalog'), # optional + ('dataset', datasets), # required + ]) + return catalog + + @staticmethod + def make_datajson_export_entry(package): + global currentPackageOrg + currentPackageOrg = None + # extras is a list of dicts [{},{}, {}]. For each dict, extract the key, value entries into a new dict + extras = dict([(x['key'], x['value']) for x in package['extras']]) + + parent_dataset_id = extras.get('parent_dataset') + if parent_dataset_id: + parent = model.Package.get(parent_dataset_id) + parent_uid = parent.extras.col.target['unique_id'].value + if parent_uid: + parent_dataset_id = parent_uid + + # if resource format is CSV then convert it to text/csv + # Resource format has to be in 'csv' format for automatic datastore push. 
+ for r in package["resources"]: + if r["format"].lower() == "csv": + r["format"] = "text/csv" + if r["format"].lower() == "json": + r["format"] = "application/json" + if r["format"].lower() == "pdf": + r["format"] = "application/pdf" + + try: + retlist = [ + ("@type", "dcat:Dataset"), # optional + + ("title", JsonExportBuilder.strip_if_string(package["title"])), # required + + # ("accessLevel", 'public'), # required + ("accessLevel", JsonExportBuilder.strip_if_string(extras.get('public_access_level'))), # required + + # ("accrualPeriodicity", "R/P1Y"), # optional + # ('accrualPeriodicity', 'accrual_periodicity'), + ('accrualPeriodicity', JsonExportBuilder.get_accrual_periodicity(extras.get('accrual_periodicity'))), + # optional + + ("conformsTo", JsonExportBuilder.strip_if_string(extras.get('conforms_to'))), # optional + + # ('contactPoint', OrderedDict([ + # ("@type", "vcard:Contact"), + # ("fn", "Jane Doe"), + # ("hasEmail", "mailto:jane.doe@agency.gov") + # ])), # required + ('contactPoint', JsonExportBuilder.get_contact_point(extras)), # required + + ("dataQuality", JsonExportBuilder.strip_if_string(extras.get('data_quality'))), + # required-if-applicable + + ("describedBy", JsonExportBuilder.strip_if_string(extras.get('data_dictionary'))), # optional + ("describedByType", JsonExportBuilder.strip_if_string(extras.get('data_dictionary_type'))), # optional + + ("description", JsonExportBuilder.strip_if_string(package["notes"])), # required + + # ("description", 'asdfasdf'), # required + + ("identifier", JsonExportBuilder.strip_if_string(extras.get('unique_id'))), # required + # ("identifier", 'asdfasdfasdf'), # required + + ("isPartOf", parent_dataset_id), # optional + ("issued", JsonExportBuilder.strip_if_string(extras.get('release_date'))), # optional + + # ("keyword", ['a', 'b']), # required + ("keyword", [t["display_name"] for t in package["tags"]]), # required + + ("landingPage", JsonExportBuilder.strip_if_string(extras.get('homepage_url'))), # optional + + ("license", JsonExportBuilder.strip_if_string(extras.get("license_new"))), # required-if-applicable + + ("modified", + JsonExportBuilder.strip_if_string(extras.get("modified", package.get("metadata_modified")))), + # required + + ("primaryITInvestmentUII", JsonExportBuilder.strip_if_string(extras.get('primary_it_investment_uii'))), + # optional + + # ('publisher', OrderedDict([ + # ("@type", "org:Organization"), + # ("name", "Widget Services") + # ])), # required + # ("publisher", get_publisher_tree(extras)), # required + ("publisher", JsonExportBuilder.get_publisher_tree_wrong_order(extras)), # required + + ("rights", JsonExportBuilder.strip_if_string(extras.get('access_level_comment'))), # required + + ("spatial", JsonExportBuilder.strip_if_string(package.get("spatial"))), # required-if-applicable + + ('systemOfRecords', JsonExportBuilder.strip_if_string(extras.get('system_of_records'))), # optional + + ("temporal", JsonExportBuilder.strip_if_string(extras.get('temporal'))), # required-if-applicable + + ("distribution", JsonExportBuilder.generate_distribution(package)), # required-if-applicable + + # ("distribution", + # #TODO distribution should hide any key/value pairs where value is "" or None (e.g. 
format) + # [ + # OrderedDict([ + # ("downloadURL", r["url"]), + # ("mediaType", r["formatReadable"]), + # ]) + # for r in package["resources"] + # ]) + ] + + for pair in [ + ('bureauCode', 'bureau_code'), # required + ('language', 'language'), # optional + ('programCode', 'program_code'), # required + ('references', 'related_documents'), # optional + ('theme', 'category'), # optional + ]: + JsonExportBuilder.split_multiple_entries(retlist, extras, pair) + + except KeyError as e: + log.warn("Missing Required Field for package with id=[%s], title=['%s'], organization=['%s']: '%s'" % ( + package.get('id'), package.get('title'), currentPackageOrg, e)) + + errors = ['Missing Required Field', ["%s" % e]] + errors_dict = OrderedDict([ + ('id', package.get('id')), + ('name', package.get('name')), + ('title', package.get('title')), + ('organization', currentPackageOrg), + ('errors', errors), + ]) + + return errors_dict + + # Remove entries where value is None, "", or empty list [] + striped_retlist = [(x, y) for x, y in retlist if y is not None and y != "" and y != []] + + # When saved from UI DataQuality value is stored as "on" instead of True. + # Check if value is "on" and replace it with True. + striped_retlist_dict = OrderedDict(striped_retlist) + if striped_retlist_dict.get('dataQuality') == "on" \ + or striped_retlist_dict.get('dataQuality') == "true" \ + or striped_retlist_dict.get('dataQuality') == "True": + striped_retlist_dict['dataQuality'] = True + elif striped_retlist_dict.get('dataQuality') == "false" \ + or striped_retlist_dict.get('dataQuality') == "False": + striped_retlist_dict['dataQuality'] = False + + from datajsonvalidator import do_validation + + errors = [] + try: + do_validation([dict(striped_retlist_dict)], errors) + except Exception as e: + errors.append(("Internal Error", ["Something bad happened: " + unicode(e)])) + if len(errors) > 0: + for error in errors: + log.warn(error) + + errors_dict = OrderedDict([ + ('id', package.get('id')), + ('name', package.get('name')), + ('title', package.get('title')), + ('organization', currentPackageOrg), + ('errors', errors), + ]) + + return errors_dict + + return striped_retlist_dict + + # used by get_accrual_periodicity + accrual_periodicity_dict = { + 'completely irregular': 'irregular', + 'decennial': 'R/P10Y', + 'quadrennial': 'R/P4Y', + 'annual': 'R/P1Y', + 'bimonthly': 'R/P2M', # or R/P0.5M + 'semiweekly': 'R/P3.5D', + 'daily': 'R/P1D', + 'biweekly': 'R/P2W', # or R/P0.5W + 'semiannual': 'R/P6M', + 'biennial': 'R/P2Y', + 'triennial': 'R/P3Y', + 'three times a week': 'R/P0.33W', + 'three times a month': 'R/P0.33M', + 'continuously updated': 'R/PT1S', + 'monthly': 'R/P1M', + 'quarterly': 'R/P3M', + 'semimonthly': 'R/P0.5M', + 'three times a year': 'R/P4M', + 'weekly': 'R/P1W' + } + + @staticmethod + def get_accrual_periodicity(frequency): + return JsonExportBuilder.accrual_periodicity_dict.get(str(frequency).lower().strip(), frequency) + + @staticmethod + def generate_distribution(package): + arr = [] + for r in package["resources"]: + resource = [("@type", "dcat:Distribution")] + rkeys = r.keys() + if 'url' in rkeys: + res_url = JsonExportBuilder.strip_if_string(r.get('url')) + if res_url: + res_url = res_url.replace('http://[[REDACTED', '[[REDACTED') + res_url = res_url.replace('http://http', 'http') + if 'api' == r.get('resource_type') or 'accessurl' == r.get('resource_type'): + resource += [("accessURL", res_url)] + else: + resource += [("downloadURL", res_url)] + if 'format' in rkeys: + res_format = 
JsonExportBuilder.strip_if_string(r.get('format')) + if res_format: + resource += [("mediaType", res_format)] + else: + log.warn("Missing mediaType for resource in package ['%s']", package.get('id')) + else: + log.warn("Missing downloadURL for resource in package ['%s']", package.get('id')) + + # if 'accessURL_new' in rkeys: + # res_access_url = JsonExportBuilder.strip_if_string(r.get('accessURL_new')) + # if res_access_url: + # resource += [("accessURL", res_access_url)] + + if 'formatReadable' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('formatReadable')) + if res_attr: + resource += [("format", res_attr)] + + if 'name' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('name')) + if res_attr: + resource += [("title", res_attr)] + + if 'notes' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('notes')) + if res_attr: + resource += [("description", res_attr)] + + if 'conformsTo' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('conformsTo')) + if res_attr: + resource += [("conformsTo", res_attr)] + + if 'describedBy' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('describedBy')) + if res_attr: + resource += [("describedBy", res_attr)] + + if 'describedByType' in rkeys: + res_attr = JsonExportBuilder.strip_if_string(r.get('describedByType')) + if res_attr: + resource += [("describedByType", res_attr)] + + striped_resource = [(x, y) for x, y in resource if y is not None and y != "" and y != []] + + arr += [OrderedDict(striped_resource)] + + return arr + + @staticmethod + def get_contact_point(extras): + for required_field in ["contact_name", "contact_email"]: + if required_field not in extras.keys(): + raise KeyError(required_field) + + fn = JsonExportBuilder.strip_if_string(extras['contact_name']) + if fn is None: + raise KeyError('contact_name') + + email = JsonExportBuilder.strip_if_string(extras['contact_email']) + if email is None: + raise KeyError('contact_email') + + if '[[REDACTED' not in email: + if '@' not in email: + raise KeyError('contact_email') + else: + email = 'mailto:' + email + + contact_point = OrderedDict([ + ('@type', 'vcard:Contact'), # optional + ('fn', fn), # required + ('hasEmail', email), # required + ]) + return contact_point + + @staticmethod + def extra(package, key, default=None): + # Retrieves the value of an extras field. 
+ for xtra in package["extras"]: + if xtra["key"] == key: + return xtra["value"] + return default + + @staticmethod + def get_publisher_tree_wrong_order(extras): + global currentPackageOrg + publisher = JsonExportBuilder.strip_if_string(extras.get('publisher')) + if publisher is None: + return None + # raise KeyError('publisher') + + currentPackageOrg = publisher + + organization_list = list() + organization_list.append([ + ('@type', 'org:Organization'), # optional + ('name', publisher), # required + ]) + + for i in range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + organization_list.append([ + ('@type', 'org:Organization'), # optional + ('name', JsonExportBuilder.strip_if_string(extras[key])), # required + ]) + currentPackageOrg = extras[key] + + size = len(organization_list) + + # [OSCIT, GSA] + # organization_list.reverse() + # [GSA, OSCIT] + + tree = False + for i in range(0, size): + if tree: + organization_list[i] += [('subOrganizationOf', OrderedDict(tree))] + tree = organization_list[i] + + return OrderedDict(tree) + + @staticmethod + def underscore_to_camelcase(value): + """ + Convert underscored strings to camel case, e.g. one_two_three to oneTwoThree + """ + + def camelcase(): + yield unicode.lower + while True: + yield unicode.capitalize + + c = camelcase() + return "".join(c.next()(x) if x else '_' for x in value.split("_")) + + @staticmethod + def get_best_resource(package, acceptable_formats): + resources = list(r for r in package["resources"] if r["format"].lower() in acceptable_formats) + if len(resources) == 0: return {} + resources.sort(key=lambda r: acceptable_formats.index(r["format"].lower())) + return resources[0] + + @staticmethod + def strip_if_string(val): + if isinstance(val, (str, unicode)): + val = val.strip() + if '' == val: + val = None + return val + + @staticmethod + def get_primary_resource(package): + # Return info about a "primary" resource. Select a good one. + return JsonExportBuilder.get_best_resource(package, ("csv", "xls", "xml", "text", "zip", "rdf")) + + @staticmethod + def get_api_resource(package): + # Return info about an API resource. 
+        return JsonExportBuilder.get_best_resource(package, ("api", "query tool"))
+
+    @staticmethod
+    def split_multiple_entries(retlist, extras, names):
+        found_element = string.strip(extras.get(names[1], ""))
+        if found_element:
+            retlist.append(
+                (names[0], [string.strip(x) for x in string.split(found_element, ',')])
+            )
diff --git a/ckanext/datajson/build_datajsonld.py b/ckanext/datajson/build_datajsonld.py
index 9f0c073d..8a223912 100644
--- a/ckanext/datajson/build_datajsonld.py
+++ b/ckanext/datajson/build_datajsonld.py
@@ -33,14 +33,18 @@ def distribution_to_jsonld(distribution):
     "description": "dcterms:description",
     "keyword": "dcat:keyword",
     "modified": "dcterms:modified",
-    "publisher": "dcat:publisher",
-    "person": "foaf:Person",
+    "publisher": "dcterms:publisher",
+    "contactPoint": "dcat:contactPoint",
     "mbox": "foaf:mbox",
     "identifier": "dcterms:identifier",
+    "accessLevel": "pod:accessLevel",
+    "bureauCode": "pod:bureauCode",
+    "programCode": "pod:programCode",
+    "accessLevelComment": "pod:accessLevelComment",
     "dataDictionary": "dcat:dataDictionary",
     "accessURL": "dcat:accessURL",
-    "webService": "dcat:webService",
+    "webService": "pod:webService",
     "format": "dcterms:format",  # must be a dcterms:MediaTypeOrExtent
     "license": "dcterms:license",
     "spatial": "dcterms:spatial",  # must be a dcterms:Location entity
@@ -49,19 +53,16 @@ def distribution_to_jsonld(distribution):
     "issued": "dcterms:issued",
     "accrualPeriodicity": "dcterms:accrualPeriodicity",  # must be a dcterms:Frequency
     "language": "dcat:language",  # must be an IRI
-    "granularity": "dcat:granularity",
-    "dataQuality": "xsd:boolean",
+    "dataQuality": "pod:dataQuality",
     "theme": "dcat:theme",
     "references": "dcterms:references",
-    "size": "dcat:size",
     "landingPage": "dcat:landingPage",
-    "feed": "dcat:feed",
+    "systemOfRecords": "pod:systemOfRecords",
 }
 
 jsonld_metadata_datatypes = {
     "modified": "http://www.w3.org/2001/XMLSchema#dateTime",
     "issued": "http://www.w3.org/2001/XMLSchema#dateTime",
-    "size": "http://www.w3.org/2001/XMLSchema#decimal",
 }
 
 def apply_jsonld_metadata_mapping(data, newdict):
@@ -72,10 +73,6 @@ def apply_jsonld_metadata_mapping(data, newdict):
         # skip fields with no mapping to RDF
         if k not in jsonld_metadata_mapping: continue
 
-        # specially handle 'keyword' which in JSON is packed in a comma-separated field
-        if k == "keyword":
-            v = v.split(",")
-
         # specially handle literal fields with datatypes
         if k in jsonld_metadata_datatypes:
             # Convert ISO datetime format to xsd:dateTime format.
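For context on the mapping change above: `apply_jsonld_metadata_mapping` rewrites Project Open Data keys into JSON-LD predicates via `jsonld_metadata_mapping`, attaching an XSD datatype when the key also appears in `jsonld_metadata_datatypes`. The sketch below only illustrates that key-for-key rewrite; it is not the module's actual implementation, and the trimmed-down `MAPPING`/`DATATYPES` tables and the `to_jsonld` helper are assumptions made for the example.

```python
from collections import OrderedDict

# Trimmed-down stand-ins for the module-level tables shown in the diff above (illustrative only).
MAPPING = {"title": "dcterms:title", "modified": "dcterms:modified",
           "publisher": "dcterms:publisher", "accessLevel": "pod:accessLevel"}
DATATYPES = {"modified": "http://www.w3.org/2001/XMLSchema#dateTime"}


def to_jsonld(dataset):
    # Rewrite each data.json key to its JSON-LD predicate; unmapped keys are skipped.
    out = OrderedDict()
    for key, value in dataset.items():
        if key not in MAPPING:
            continue
        if key in DATATYPES:
            # Emit a typed literal, e.g. dcterms:modified carrying xsd:dateTime.
            value = {"@value": value, "@type": DATATYPES[key]}
        out[MAPPING[key]] = value
    return out


print(to_jsonld({"title": "Example dataset", "modified": "2014-01-01T00:00:00Z",
                 "accessLevel": "public", "ignored": "dropped"}))
```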
diff --git a/ckanext/datajson/datajsonvalidator.py b/ckanext/datajson/datajsonvalidator.py
index b1102a0b..28739f9e 100644
--- a/ckanext/datajson/datajsonvalidator.py
+++ b/ckanext/datajson/datajsonvalidator.py
@@ -76,13 +76,19 @@
 r'(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$'
 )
 
+REDACTED_REGEX = re.compile(
+    r'^(\[\[REDACTED).*?(\]\])$'
+)
+
 # load the OMB bureau codes on first load of this module
-import urllib, csv
+import urllib
+import csv
 
 omb_burueau_codes = set()
 for row in csv.DictReader(urllib.urlopen("https://project-open-data.cio.gov/data/omb_bureau_codes.csv")):
     omb_burueau_codes.add(row["Agency Code"] + ":" + row["Bureau Code"])
 
+
 # main function for validation
 def do_validation(doc, errors_array):
     errs = {}
@@ -136,14 +142,15 @@ def do_validation(doc, errors_array):
 
         # contactPoint - hasEmail # required
         if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
-            import lepl.apps.rfc3696
+            if not is_redacted(cp.get('hasEmail')):
+                import lepl.apps.rfc3696
 
-            email_validator = lepl.apps.rfc3696.Email()
-            email = cp["hasEmail"].replace('mailto:', '')
-            if not email_validator(email):
-                add_error(errs, 5, "Invalid Required Field Value",
-                          "The email address \"%s\" is not a valid email address." % email,
-                          dataset_name)
+                email_validator = lepl.apps.rfc3696.Email()
+                email = cp["hasEmail"].replace('mailto:', '')
+                if not email_validator(email):
+                    add_error(errs, 5, "Invalid Required Field Value",
+                              "The email address \"%s\" is not a valid email address." % email,
+                              dataset_name)
 
         # description # required
         check_required_string_field(item, "description", 1, dataset_name, errs)
@@ -158,8 +165,9 @@ def do_validation(doc, errors_array):
 
         # keyword # required
         if isinstance(item.get("keyword"), (str, unicode)):
-            add_error(errs, 5, "Update Your File!",
-                      "The keyword field used to be a string but now it must be an array.", dataset_name)
+            if not is_redacted(item.get("keyword")):
+                add_error(errs, 5, "Update Your File!",
+                          "The keyword field used to be a string but now it must be an array.", dataset_name)
         elif check_required_field(item, "keyword", list, dataset_name, errs):
             for kw in item["keyword"]:
                 if not isinstance(kw, (str, unicode)):
@@ -171,7 +179,8 @@ def do_validation(doc, errors_array):
 
         # modified # required
         if check_required_string_field(item, "modified", 1, dataset_name, errs):
-            if not MODIFIED_REGEX_1.match(item['modified']) \
+            if not is_redacted(item['modified']) \
+                    and not MODIFIED_REGEX_1.match(item['modified']) \
                     and not MODIFIED_REGEX_2.match(item['modified']) \
                     and not MODIFIED_REGEX_3.match(item['modified']):
                 add_error(errs, 5, "Invalid Required Field Value",
@@ -195,8 +204,8 @@ def do_validation(doc, errors_array):
         # Required-If-Applicable
 
         # dataQuality # Required-If-Applicable
-        if item.get("dataQuality") is None:
-            pass  # not required
+        if item.get("dataQuality") is None or is_redacted(item.get("dataQuality")):
+            pass  # not required or REDACTED
         elif not isinstance(item["dataQuality"], bool):
             add_error(errs, 50, "Invalid Field Value (Optional Fields)",
                       "The field 'dataQuality' must be true or false, "
@@ -207,35 +216,42 @@ def do_validation(doc, errors_array):
         if item.get("distribution") is None:
             pass  # not required
         elif not isinstance(item["distribution"], list):
-            add_error(errs, 50, "Invalid Field Value (Optional Fields)",
-                      "The field 'distribution' must be an array, if present.", dataset_name)
+            if isinstance(item["distribution"], (str, unicode)) and is_redacted(item.get("distribution")):
+                pass
+            else:
+                add_error(errs, 50, "Invalid Field Value (Optional Fields)",
+                          "The field 'distribution' must be an array, if present.", dataset_name)
         else:
             for j, dt in enumerate(item["distribution"]):
+                if isinstance(dt, (str, unicode)):
+                    if is_redacted(dt):
+                        continue
                 distribution_name = dataset_name + (" distribution %d" % (j + 1))
                 # distribution - downloadURL # Required-If-Applicable
-                check_url_field(False, dt, "downloadURL", distribution_name, errs)
+                check_url_field(False, dt, "downloadURL", distribution_name, errs, True)
 
                 # distribution - mediaType # Required-If-Applicable
                 if 'downloadURL' in dt:
                     if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
-                        if not IANA_MIME_REGEX.match(dt["mediaType"]):
+                        if not IANA_MIME_REGEX.match(dt["mediaType"]) \
+                                and not is_redacted(dt["mediaType"]):
                             add_error(errs, 5, "Invalid Field Value",
                                       "The distribution mediaType \"%s\" is invalid. "
                                       "It must be in IANA MIME format." % dt["mediaType"],
                                       distribution_name)
 
                 # distribution - accessURL # optional
-                check_url_field(False, dt, "accessURL", distribution_name, errs)
+                check_url_field(False, dt, "accessURL", distribution_name, errs, True)
 
                 # distribution - conformsTo # optional
-                check_url_field(False, dt, "conformsTo", distribution_name, errs)
+                check_url_field(False, dt, "conformsTo", distribution_name, errs, True)
 
                 # distribution - describedBy # optional
-                check_url_field(False, dt, "describedBy", distribution_name, errs)
+                check_url_field(False, dt, "describedBy", distribution_name, errs, True)
 
                 # distribution - describedByType # optional
-                if dt.get("describedByType") is None:
-                    pass  # not required
+                if dt.get("describedByType") is None or is_redacted(dt.get("describedByType")):
+                    pass  # not required or REDACTED
                 elif not IANA_MIME_REGEX.match(dt["describedByType"]):
                     add_error(errs, 5, "Invalid Field Value",
                               "The describedByType \"%s\" is invalid. 
" @@ -255,7 +271,7 @@ def do_validation(doc, errors_array): check_required_string_field(dt, "title", 1, distribution_name, errs) # license # Required-If-Applicable - check_url_field(False, item, "license", dataset_name, errs) + check_url_field(False, item, "license", dataset_name, errs, True) # rights # Required-If-Applicable # TODO move to warnings @@ -269,8 +285,8 @@ def do_validation(doc, errors_array): "The field 'spatial' must be a string value if specified.", dataset_name) # temporal # Required-If-Applicable - if item.get("temporal") is None: - pass # not required + if item.get("temporal") is None or is_redacted(item.get("temporal")): + pass # not required or REDACTED elif not isinstance(item["temporal"], (str, unicode)): add_error(errs, 10, "Invalid Field Value (Optional Fields)", "The field 'temporal' must be a string value if specified.", dataset_name) @@ -286,19 +302,20 @@ def do_validation(doc, errors_array): # Expanded Fields # accrualPeriodicity # optional - if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES: + if item.get("accrualPeriodicity") not in ACCRUAL_PERIODICITY_VALUES \ + and not is_redacted(item.get("accrualPeriodicity")): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'accrualPeriodicity' had an invalid value.", dataset_name) # conformsTo # optional - check_url_field(False, item, "conformsTo", dataset_name, errs) + check_url_field(False, item, "conformsTo", dataset_name, errs, True) # describedBy # optional - check_url_field(False, item, "describedBy", dataset_name, errs) + check_url_field(False, item, "describedBy", dataset_name, errs, True) # describedByType # optional - if item.get("describedByType") is None: - pass # not required + if item.get("describedByType") is None or is_redacted(item.get("describedByType")): + pass # not required or REDACTED elif not IANA_MIME_REGEX.match(item["describedByType"]): add_error(errs, 5, "Invalid Field Value", "The describedByType \"%s\" is invalid. 
" @@ -310,29 +327,29 @@ def do_validation(doc, errors_array): check_required_string_field(item, "isPartOf", 1, dataset_name, errs) # issued # optional - if item.get("issued") is not None: + if item.get("issued") is not None and not is_redacted(item.get("issued")): if not ISSUED_REGEX.match(item['issued']): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'issued' is not in a valid format.", dataset_name) # landingPage # optional - check_url_field(False, item, "landingPage", dataset_name, errs) + check_url_field(False, item, "landingPage", dataset_name, errs, True) # language # optional - if item.get("language") is None: - pass # not required + if item.get("language") is None or is_redacted(item.get("language")): + pass # not required or REDACTED elif not isinstance(item["language"], list): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'language' must be an array, if present.", dataset_name) else: for s in item["language"]: - if not LANGUAGE_REGEX.match(s): + if not LANGUAGE_REGEX.match(s) and not is_redacted(s): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'language' had an invalid language: \"%s\"" % s, dataset_name) # PrimaryITInvestmentUII # optional - if item.get("PrimaryITInvestmentUII") is None: - pass # not required + if item.get("PrimaryITInvestmentUII") is None or is_redacted(item.get("PrimaryITInvestmentUII")): + pass # not required or REDACTED elif not PRIMARY_IT_INVESTMENT_UII_REGEX.match(item["PrimaryITInvestmentUII"]): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'PrimaryITInvestmentUII' must be a string " @@ -340,13 +357,16 @@ def do_validation(doc, errors_array): # references # optional if item.get("references") is None: - pass # not required + pass # not required or REDACTED elif not isinstance(item["references"], list): - add_error(errs, 50, "Invalid Field Value (Optional Fields)", - "The field 'references' must be an array, if present.", dataset_name) + if isinstance(item["references"], (str, unicode)) and is_redacted(item.get("references")): + pass + else: + add_error(errs, 50, "Invalid Field Value (Optional Fields)", + "The field 'references' must be an array, if present.", dataset_name) else: for s in item["references"]: - if not URL_REGEX.match(s): + if not URL_REGEX.match(s) and not is_redacted(s): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'references' had an invalid URL: \"%s\"" % s, dataset_name) @@ -354,8 +374,8 @@ def do_validation(doc, errors_array): check_url_field(False, item, "systemOfRecords", dataset_name, errs) # theme #optional - if item.get("theme") is None: - pass # not required + if item.get("theme") is None or is_redacted(item.get("theme")): + pass # not required or REDACTED elif not isinstance(item["theme"], list): add_error(errs, 50, "Invalid Field Value (Optional Fields)", "The field 'theme' must be an array.", dataset_name) @@ -374,7 +394,7 @@ def do_validation(doc, errors_array): err_type[1], # heading [err_item + (" (%d locations)" % len(errs[err_type][err_item]) if len(errs[err_type][err_item]) else "") for err_item in sorted(errs[err_type], key=lambda x: (-len(errs[err_type][x]), x)) - ])) + ])) def add_error(errs, severity, heading, description, context=None): @@ -397,7 +417,7 @@ def check_required_field(obj, field_name, data_type, dataset_name, errs): add_error(errs, 10, "Missing Required Fields", "The '%s' field is missing." 
% field_name, dataset_name) return False elif obj[field_name] is None: - add_error(errs, 10, "Missing Required Fields", "The '%s' field is set to null." % field_name, dataset_name) + add_error(errs, 10, "Missing Required Fields", "The '%s' field is empty." % field_name, dataset_name) return False elif not isinstance(obj[field_name], data_type): add_error(errs, 5, "Invalid Required Field Value", @@ -426,11 +446,18 @@ def check_required_string_field(obj, field_name, min_length, dataset_name, errs) return True -def check_url_field(required, obj, field_name, dataset_name, errs): +def is_redacted(field): + if isinstance(field, (str, unicode)) and REDACTED_REGEX.match(field): + return True + return False + + +def check_url_field(required, obj, field_name, dataset_name, errs, allow_redacted=False): # checks that a required or optional field, if specified, looks like a URL if not required and (field_name not in obj or obj[field_name] is None): return True # not required, so OK if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs): return False # just checking data type + if allow_redacted and is_redacted(obj[field_name]): return True if not URL_REGEX.match(obj[field_name]): add_error(errs, 5, "Invalid Required Field Value", "The '%s' field has an invalid URL: \"%s\"." % (field_name, obj[field_name]), dataset_name) diff --git a/ckanext/datajson/harvester_base.py b/ckanext/datajson/harvester_base.py index 89d4ffd6..d1cf4de0 100644 --- a/ckanext/datajson/harvester_base.py +++ b/ckanext/datajson/harvester_base.py @@ -1,23 +1,40 @@ from ckan.lib.base import c from ckan import model +from ckan import plugins as p from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.munge import munge_title_to_name from ckan.lib.search.index import PackageSearchIndex +from ckan.lib.navl.dictization_functions import Invalid +from ckan.lib.navl.validators import ignore_empty from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ - HarvestObjectError + HarvestObjectError, HarvestObjectExtra from ckanext.harvest.harvesters.base import HarvesterBase -import uuid, datetime, hashlib, urllib2, json, yaml +import uuid, datetime, hashlib, urllib2, json, yaml, json, os + +from jsonschema.validators import Draft4Validator +from jsonschema import FormatChecker import logging log = logging.getLogger("harvester") +VALIDATION_SCHEMA = [ + ('', 'Project Open Data (Federal)'), + ('non-federal', 'Project Open Data (Non-Federal)'), + ] + +def validate_schema(schema): + if schema not in [s[0] for s in VALIDATION_SCHEMA]: + raise Invalid('Unknown validation schema: {0}'.format(schema)) + return schema + class DatasetHarvesterBase(HarvesterBase): ''' A Harvester for datasets. ''' + _user_name = None # SUBCLASSES MUST IMPLEMENT #HARVESTER_VERSION = "1.0" @@ -34,13 +51,49 @@ def validate_config(self, config): config_obj = yaml.load(config) return config + def load_config(self, harvest_source): + # Load the harvest source's configuration data. We expect it to be a YAML + # string. Unfortunately I went ahead of CKAN on this. The stock CKAN harvester + # only allows JSON in the configuration box. 
My fork is necessary for this + # to work: https://github.com/joshdata/ckanext-harvest + + ret = { + "filters": { }, # map data.json field name to list of values one of which must be present + "defaults": { }, # map field name to value to supply as default if none exists, handled by the actual importer module, so the field names may be arbitrary + } + + source_config = yaml.load(harvest_source.config) + + try: + ret["filters"].update(source_config["filters"]) + except TypeError: + pass + except KeyError: + pass + + try: + ret["defaults"].update(source_config["defaults"]) + except TypeError: + pass + except KeyError: + pass + + return ret + + def _get_user_name(self): + if not self._user_name: + user = p.toolkit.get_action('get_site_user')({'model': model, 'ignore_auth': True}, {}) + self._user_name = user['name'] + + return self._user_name + def context(self): # Reusing the dict across calls to action methods can be dangerous, so # create a new dict every time we need it. # Setting validate to False is critical for getting the harvester plugin # to set extra fields on the package during indexing (see ckanext/harvest/plugin.py # line 99, https://github.com/okfn/ckanext-harvest/blob/master/ckanext/harvest/plugin.py#L99). - return { "user": "harvest", "ignore_auth": True, "validate": False } + return { "user": self._get_user_name(), "ignore_auth": True } # SUBCLASSES MUST IMPLEMENT def load_remote_catalog(self, harvest_job): @@ -49,6 +102,11 @@ def load_remote_catalog(self, harvest_job): # with a locally unique identifier string and a 'title' field. raise Exception("Not implemented") + def extra_schema(self): + return { + 'validator_schema': [ignore_empty, unicode, validate_schema], + } + def gather_stage(self, harvest_job): # The gather stage scans a remote resource (like a /data.json file) for # a list of datasets to import. @@ -56,32 +114,158 @@ def gather_stage(self, harvest_job): log.debug('In %s gather_stage (%s)' % (repr(self), harvest_job.source.url)) # Start gathering. - source = self.load_remote_catalog(harvest_job) - if len(source) == 0: return [] + try: + source_datasets, catalog_values = self.load_remote_catalog(harvest_job) + except ValueError as e: + self._save_gather_error("Error loading json content: %s." % (e), harvest_job) + return [] + + if len(source_datasets) == 0: return [] + + DATAJSON_SCHEMA = { + "https://project-open-data.cio.gov/v1.1/schema": '1.1', + } + + # schema version is default 1.0, or a valid one (1.1, ...) + schema_version = '1.0' + parent_identifiers = set() + child_identifiers = set() + catalog_extras = {} + if isinstance(catalog_values, dict): + schema_value = catalog_values.get('conformsTo', '') + if schema_value not in DATAJSON_SCHEMA.keys(): + self._save_gather_error('Error reading json schema value.' \ + ' The given value is %s.' 
% ('empty' if schema_value == '' + else schema_value), harvest_job) + return [] + schema_version = DATAJSON_SCHEMA.get(schema_value, '1.0') + + for dataset in source_datasets: + parent_identifier = dataset.get('isPartOf') + if parent_identifier: + parent_identifiers.add(parent_identifier) + child_identifiers.add(dataset.get('identifier')) + + # get a list of needed catalog values and put into hobj + catalog_fields = ['@context', '@id', 'conformsTo', 'describedBy'] + catalog_extras = dict(('catalog_'+k, v) + for (k, v) in catalog_values.iteritems() + if k in catalog_fields) # Loop through the packages we've already imported from this source # and go into their extra fields to get their source_identifier, # which corresponds to the remote catalog's 'identifier' field. # Make a mapping so we know how to update existing records. + # Added: mark all existing parent datasets. existing_datasets = { } + existing_parents = { } for hobj in model.Session.query(HarvestObject).filter_by(source=harvest_job.source, current=True): try: pkg = get_action('package_show')(self.context(), { "id": hobj.package_id }) except: # reference is broken continue - sid = self.find_extra(pkg, "source_identifier") + sid = self.find_extra(pkg, "identifier") + is_parent = self.find_extra(pkg, "collection_metadata") if sid: existing_datasets[sid] = pkg + if is_parent and pkg.get("state") == "active": + existing_parents[sid] = pkg + + # which parents has been demoted to child level? + existing_parents_demoted = set( + identifier for identifier in existing_parents.keys() \ + if identifier not in parent_identifiers) + + # if there is any new parents, we will have to harvest parents + # first, mark the status in harvest_source config, which + # triggers a children harvest_job after parents job is finished. + source = harvest_job.source + source_config = json.loads(source.config or '{}') + # run status: None, or parents_run, or children_run? + run_status = source_config.get('datajson_collection') + if parent_identifiers: + for parent in parent_identifiers & child_identifiers: + self._save_gather_error("Collection identifier '%s' \ + cannot be isPartOf another collection." \ + % parent, harvest_job) + + new_parents = set(identifier for identifier in parent_identifiers \ + if identifier not in existing_parents.keys()) + if new_parents: + if not run_status: + # fresh start + run_status = 'parents_run' + source_config['datajson_collection'] = run_status + source.config = json.dumps(source_config) + source.save() + elif run_status == 'children_run': + # it means new parents are tried and failed. + # but skip some which have previously reported with + # parent_identifiers & child_identifiers + for parent in new_parents - \ + (parent_identifiers & child_identifiers): + self._save_gather_error("Collection identifier '%s' \ + not found. Records which are part of this \ + collection will not be harvested." \ + % parent, harvest_job) + else: + # run_status was parents_run, and did not finish. + # something wrong but not sure what happened. + # let's leave it as it is, let it run one more time. + pass + else: + # all parents are already in place. run it as usual. + run_status = None + elif run_status: + # need to clear run_status + run_status = None + source_config['datajson_collection'] = run_status + source.config = json.dumps(source_config) + source.save() # Create HarvestObjects for any records in the remote catalog. 
object_ids = [] seen_datasets = set() + unique_datasets = set() - for dataset in source: + filters = self.load_config(harvest_job.source)["filters"] + + for dataset in source_datasets: # Create a new HarvestObject for this dataset and save the # dataset metdata inside it for later. + + # Check the config's filters to see if we should import this dataset. + # For each filter, check that the value specified in the data.json file + # is among the permitted values in the filter specification. + matched_filters = True + for k, v in filters.items(): + if dataset.get(k) not in v: + matched_filters = False + if not matched_filters: + continue + + if parent_identifiers and new_parents \ + and dataset['identifier'] not in parent_identifiers \ + and dataset.get('isPartOf') in new_parents: + if run_status == 'parents_run': + # skip those whose parents still need to run. + continue + else: + # which is 'children_run'. + # error out since parents got issues. + self._save_gather_error( + "Record with identifier '%s': isPartOf '%s' points to \ + an erroneous record." % (dataset['identifier'], + dataset.get('isPartOf')), harvest_job) + continue + + # Some source contains duplicate identifiers. skip all except the first one + if dataset['identifier'] in unique_datasets: + self._save_gather_error("Duplicate entry ignored for identifier: '%s'." % (dataset['identifier']), harvest_job) + continue + unique_datasets.add(dataset['identifier']) # Get the package_id of this resource if we've already imported # it into our system. Otherwise, assign a brand new GUID to the @@ -96,7 +280,8 @@ def gather_stage(self, harvest_job): # in the package so we can avoid updating datasets that # don't look like they've changed. if pkg.get("state") == "active" \ - and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, harvest_job.source): + and dataset['identifier'] not in existing_parents_demoted \ + and self.find_extra(pkg, "source_hash") == self.make_upstream_content_hash(dataset, harvest_job.source, catalog_extras, schema_version): continue else: pkg_id = uuid.uuid4().hex @@ -104,9 +289,22 @@ def gather_stage(self, harvest_job): # Create a new HarvestObject and store in it the GUID of the # existing dataset (if it exists here already) and the dataset's # metadata from the remote catalog file. 
+ extras = [HarvestObjectExtra( + key='schema_version', value=schema_version)] + if dataset['identifier'] in parent_identifiers: + extras.append(HarvestObjectExtra( + key='is_collection', value=True)) + elif dataset.get('isPartOf'): + parent_pkg_id = existing_parents[dataset.get('isPartOf')]['id'] + extras.append(HarvestObjectExtra( + key='collection_pkg_id', value=parent_pkg_id)) + for k, v in catalog_extras.iteritems(): + extras.append(HarvestObjectExtra(key=k, value=v)) + obj = HarvestObject( guid=pkg_id, job=harvest_job, + extras=extras, content=json.dumps(dataset, sort_keys=True)) # use sort_keys to preserve field order so hashes of this string are constant from run to run obj.save() object_ids.append(obj.id) @@ -116,9 +314,14 @@ def gather_stage(self, harvest_job): if upstreamid in seen_datasets: continue # was just updated if pkg.get("state") == "deleted": continue # already deleted pkg["state"] = "deleted" - pkg["name"] = self.make_package_name(pkg["title"], pkg["id"], True) # try to prevent name clash by giving it a "deleted-" name log.warn('deleting package %s (%s) because it is no longer in %s' % (pkg["name"], pkg["id"], harvest_job.source.url)) get_action('package_update')(self.context(), pkg) + obj = HarvestObject( + guid=pkg_id, + job=harvest_job, + ) + obj.save() + object_ids.append(obj.id) return object_ids @@ -128,29 +331,217 @@ def fetch_stage(self, harvest_object): return True # SUBCLASSES MUST IMPLEMENT - def set_dataset_info(self, pkg, dataset, dataset_defaults): + def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version): # Sets package metadata on 'pkg' using the remote catalog's metadata # in 'dataset' and default values as configured in 'dataset_defaults'. raise Exception("Not implemented.") + # validate dataset against POD schema + # use a local copy. + def _validate_dataset(self, validator_schema, schema_version, dataset): + if validator_schema == 'non-federal': + if schema_version == '1.1': + file_path = 'pod_schema/non-federal-v1.1/dataset-non-federal.json' + else: + file_path = 'pod_schema/non-federal/single_entry.json' + else: + if schema_version == '1.1': + file_path = 'pod_schema/federal-v1.1/dataset.json' + else: + file_path = 'pod_schema/single_entry.json' + + with open(os.path.join( + os.path.dirname(__file__), file_path)) as json_file: + schema = json.load(json_file) + + msg = ";" + errors = Draft4Validator(schema, format_checker=FormatChecker()).iter_errors(dataset) + count = 0 + for error in errors: + count += 1 + msg = msg + " ### ERROR #" + str(count) + ": " + self._validate_readable_msg(error) + "; " + msg = msg.strip("; ") + if msg: + id = "Identifier: " + (dataset.get("identifier") if dataset.get("identifier") else "Unknown") + title = "Title: " + (dataset.get("title") if dataset.get("title") else "Unknown") + msg = id + "; " + title + "; " + str(count) + " Error(s) Found. " + msg + "." + return msg + + # make ValidationError readable. + def _validate_readable_msg(self, e): + msg = e.message.replace("u'", "'") + elem = "" + try: + if e.schema_path[0] == 'properties': + elem = e.schema_path[1] + elem = "'" + elem + "':" + except: + pass + + return elem + msg + def import_stage(self, harvest_object): # The import stage actually creates the dataset. log.debug('In %s import_stage' % repr(self)) - # Get default values. 
- source_config = yaml.load(harvest_object.source.config) - dataset_defaults = None - try: - dataset_defaults = source_config["defaults"] - except TypeError: - pass - except KeyError: - pass - if not dataset_defaults: dataset_defaults = { } - - # Get the metadata that we stored in the HarvestObject's content field. + if(harvest_object.content == None): + return True + dataset = json.loads(harvest_object.content) + schema_version = '1.0' # default to '1.0' + is_collection = False + parent_pkg_id = '' + catalog_extras = {} + for extra in harvest_object.extras: + if extra.key == 'schema_version': + schema_version = extra.value + if extra.key == 'is_collection' and extra.value: + is_collection = True + if extra.key == 'collection_pkg_id' and extra.value: + parent_pkg_id = extra.value + if extra.key.startswith('catalog_'): + catalog_extras[extra.key] = extra.value + + # if this dataset is part of collection, we need to check if + # parent dataset exist or not. we dont support any hierarchy + # in this, so the check does not apply to those of is_collection + if parent_pkg_id and not is_collection: + parent_pkg = None + try: + parent_pkg = get_action('package_show')(self.context(), + { "id": parent_pkg_id }) + except: + pass + if not parent_pkg: + parent_check_message = "isPartOf identifer '%s' not found." \ + % dataset.get('isPartOf') + self._save_object_error(parent_check_message, harvest_object, + 'Import') + return None + + # Get default values. + dataset_defaults = self.load_config(harvest_object.source)["defaults"] + + source_config = json.loads(harvest_object.source.config or '{}') + validator_schema = source_config.get('validator_schema') + if schema_version == '1.0' and validator_schema != 'non-federal': + lowercase_conversion = True + else: + lowercase_conversion = False + + MAPPING = { + "title": "title", + "description": "notes", + "keyword": "tags", + "modified": "extras__modified", # ! revision_timestamp + "publisher": "extras__publisher", # !owner_org + "contactPoint": "maintainer", + "mbox": "maintainer_email", + "identifier": "extras__identifier", # !id + "accessLevel": "extras__accessLevel", + + "bureauCode": "extras__bureauCode", + "programCode": "extras__programCode", + "accessLevelComment": "extras__accessLevelComment", + "license": "extras__license", # !license_id + "spatial": "extras__spatial", # Geometry not valid GeoJSON, not indexing + "temporal": "extras__temporal", + + "theme": "extras__theme", + "dataDictionary": "extras__dataDictionary", # !data_dict + "dataQuality": "extras__dataQuality", + "accrualPeriodicity":"extras__accrualPeriodicity", + "landingPage": "extras__landingPage", + "language": "extras__language", + "primaryITInvestmentUII": "extras__primaryITInvestmentUII", # !PrimaryITInvestmentUII + "references": "extras__references", + "issued": "extras__issued", + "systemOfRecords": "extras__systemOfRecords", + + "accessURL": None, + "webService": None, + "format": None, + "distribution": None, + } + + MAPPING_V1_1 = { + "title": "title", + "description": "notes", + "keyword": "tags", + "modified": "extras__modified", # ! 
revision_timestamp + "publisher": "extras__publisher", # !owner_org + "contactPoint": {"fn":"maintainer", "hasEmail":"maintainer_email"}, + "identifier": "extras__identifier", # !id + "accessLevel": "extras__accessLevel", + + "bureauCode": "extras__bureauCode", + "programCode": "extras__programCode", + "rights": "extras__rights", + "license": "extras__license", # !license_id + "spatial": "extras__spatial", # Geometry not valid GeoJSON, not indexing + "temporal": "extras__temporal", + + "theme": "extras__theme", + "dataDictionary": "extras__dataDictionary", # !data_dict + "dataQuality": "extras__dataQuality", + "accrualPeriodicity":"extras__accrualPeriodicity", + "landingPage": "extras__landingPage", + "language": "extras__language", + "primaryITInvestmentUII": "extras__primaryITInvestmentUII", # !PrimaryITInvestmentUII + "references": "extras__references", + "issued": "extras__issued", + "systemOfRecords": "extras__systemOfRecords", + + "distribution": None, + } + + SKIP = ["accessURL", "webService", "format", "distribution"] # will go into pkg["resources"] + # also skip the processed_how key, it was added to indicate how we processed the dataset. + SKIP.append("processed_how"); + + SKIP_V1_1 = ["@type", "isPartOf", "distribution"] + SKIP_V1_1.append("processed_how"); + + if lowercase_conversion: + + mapping_processed = {} + for k,v in MAPPING.items(): + mapping_processed[k.lower()] = v + + skip_processed = [k.lower() for k in SKIP] + + dataset_processed = {'processed_how': ['lowercase']} + for k,v in dataset.items(): + if k.lower() in mapping_processed.keys(): + dataset_processed[k.lower()] = v + else: + dataset_processed[k] = v + + if 'distribution' in dataset and dataset['distribution'] is not None: + dataset_processed['distribution'] = [] + for d in dataset['distribution']: + d_lower = {} + for k,v in d.items(): + if k.lower() in mapping_processed.keys(): + d_lower[k.lower()] = v + else: + d_lower[k] = v + dataset_processed['distribution'].append(d_lower) + else: + dataset_processed = dataset + mapping_processed = MAPPING + skip_processed = SKIP + + if schema_version == '1.1': + mapping_processed = MAPPING_V1_1 + skip_processed = SKIP_V1_1 + + validate_message = self._validate_dataset(validator_schema, + schema_version, dataset_processed) + if validate_message: + self._save_object_error(validate_message, harvest_object, 'Import') + return None # We need to get the owner organization (if any) from the harvest # source dataset @@ -158,40 +549,111 @@ def import_stage(self, harvest_object): source_dataset = model.Package.get(harvest_object.source.id) if source_dataset.owner_org: owner_org = source_dataset.owner_org - + + + source_config = json.loads(harvest_object.source.config or '{}') + group_name = source_config.get('default_groups', '') + # Assemble basic information about the dataset. 
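+ # The source_hash extra recorded below is what gather_stage compares on the next run to decide whether an upstream dataset has changed.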
+ pkg = { - "name": self.make_package_name(dataset["title"], harvest_object.guid, False), "state": "active", # in case was previously deleted "owner_org": owner_org, - "extras": [{ - "key": "source_url", - "value": harvest_object.source.url, + "groups": [{"name": group_name}], + "resources": [], + "extras": [ + { + "key": "resource-type", + "value": "Dataset", }, { - "key": "source_title", - "value": harvest_object.source.title, + "key": "source_hash", + "value": self.make_upstream_content_hash(dataset, harvest_object.source, catalog_extras, schema_version), }, { - "key": "source_identifier", - "value": dataset["identifier"], + "key": "source_datajson_identifier", + "value": True, }, { - "key": "source_hash", - "value": self.make_upstream_content_hash(dataset, harvest_object.source), + "key": "harvest_source_id", + "value": harvest_object.harvest_source_id, }, { - "key": "harvest_harvester_version", - "value": self.HARVESTER_VERSION, + "key": "harvest_object_id", + "value": harvest_object.id, }, { - "key": "harvest_last_updated", - "value": datetime.datetime.utcnow().isoformat(), - }] + "key": "harvest_source_title", + "value": harvest_object.source.title, + }, + { + "key": "source_schema_version", + "value": schema_version, + }, + ] } - + + extras = pkg["extras"] + unmapped = [] + + for key, value in dataset_processed.iteritems(): + if key in skip_processed: + continue + new_key = mapping_processed.get(key) + if not new_key: + unmapped.append(key) + continue + + # after schema 1.0+, we need to deal with multiple new_keys + new_keys = [] + values = [] + if isinstance(new_key, dict): # when schema is not 1.0 + _new_key_keys = new_key.keys() + new_keys = new_key.values() + values = [] + for _key in _new_key_keys: + values.append(value.get(_key)) + else: + new_keys.append(new_key) + values.append(value) + + if not any(item for item in values): + continue + + mini_dataset = dict(zip(new_keys, values)) + for mini_key, mini_value in mini_dataset.iteritems(): + if not mini_value: + continue + if mini_key.startswith('extras__'): + extras.append({"key": mini_key[8:], "value": mini_value}) + else: + pkg[mini_key] = mini_value + + # pick a fix number of unmapped entries and put into extra + if unmapped: + unmapped.sort() + del unmapped[100:] + for key in unmapped: + value = dataset_processed.get(key, "") + if value is not None: extras.append({"key": key, "value": value}) + + # if theme is geospatial/Geospatial, we tag it in metadata_type. + themes = self.find_extra(pkg, "theme") + if themes and ('geospatial' in [x.lower() for x in themes]): + extras.append({'key':'metadata_type', 'value':'geospatial'}) + + if is_collection: + extras.append({'key':'collection_metadata', 'value':'true'}) + elif parent_pkg_id: + extras.append( + {'key':'collection_package_id', 'value':parent_pkg_id} + ) + + for k, v in catalog_extras.iteritems(): + extras.append({'key':k, 'value':v}) + # Set specific information about the dataset. - self.set_dataset_info(pkg, dataset, dataset_defaults) + self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version) # Try to update an existing package with the ID set in harvest_object.guid. If that GUID # corresponds with an existing package, get its current metadata. 
@@ -209,7 +671,7 @@ def import_stage(self, harvest_object): for existing_res in existing_pkg.get("resources", []): if res["url"] == existing_res["url"]: res["id"] = existing_res["id"] - + pkg['groups'] = existing_pkg['groups'] existing_pkg.update(pkg) # preserve other fields that we're not setting, but clobber extras pkg = existing_pkg @@ -217,6 +679,7 @@ pkg = get_action('package_update')(self.context(), pkg) else: # It doesn't exist yet. Create a new one. + pkg['name'] = self.make_package_name(dataset_processed["title"], harvest_object.guid) try: pkg = get_action('package_create')(self.context(), pkg) log.warn('created package %s (%s) from %s' % (pkg["name"], pkg["id"], harvest_object.source.url)) @@ -243,9 +706,16 @@ return True - def make_upstream_content_hash(self, datasetdict, harvest_source): - return hashlib.sha1(json.dumps(datasetdict, sort_keys=True) - + "|" + harvest_source.config + "|" + self.HARVESTER_VERSION).hexdigest() + def make_upstream_content_hash(self, datasetdict, harvest_source, + catalog_extras, schema_version='1.0'): + if schema_version == '1.0': + return hashlib.sha1(json.dumps(datasetdict, sort_keys=True) + + "|" + harvest_source.config + "|" + + self.HARVESTER_VERSION).hexdigest() + else: + return hashlib.sha1(json.dumps(datasetdict, sort_keys=True) + + "|" + json.dumps(catalog_extras, + sort_keys=True)).hexdigest() def find_extra(self, pkg, key): for extra in pkg["extras"]: @@ -253,7 +723,7 @@ return extra["value"] return None - def make_package_name(self, title, exclude_existing_package, for_deletion): + def make_package_name(self, title, exclude_existing_package): ''' Creates a URL friendly name from a title @@ -261,13 +731,29 @@ ''' name = munge_title_to_name(title).replace('_', '-') - if for_deletion: name = "deleted-" + name while '--' in name: name = name.replace('--', '-') name = name[0:90] # max length is 100 + + # Is this slug already in use (and if we're updating a package, is it in + # use by a different package?). pkg_obj = Session.query(Package).filter(Package.name == name).filter(Package.id != exclude_existing_package).first() - if pkg_obj: - return name + "-" + str(uuid.uuid4())[:5] - else: + if not pkg_obj: + # The name is available, so use it. Note that if we're updating an + # existing package we will be updating this package's URL, so incoming + # links may break. return name - + + if exclude_existing_package: + # The name is not available, and we're updating a package. Chances + # are the package's name already had some random string attached + # to it last time. Prevent spurious updates to the package's URL + # (choosing new random text) by just reusing the existing package's + # name. + pkg_obj = Session.query(Package).filter(Package.id == exclude_existing_package).first() + if pkg_obj: # the package may not exist yet because we may be passed the desired package GUID before a new package is instantiated + return pkg_obj.name + + # Append some random text to the URL. Hope that with five characters + # there will be no collision.
+ return name + "-" + str(uuid.uuid4())[:5] diff --git a/ckanext/datajson/harvester_cmsdatanavigator.py b/ckanext/datajson/harvester_cmsdatanavigator.py index 8ae0bef6..f364683d 100644 --- a/ckanext/datajson/harvester_cmsdatanavigator.py +++ b/ckanext/datajson/harvester_cmsdatanavigator.py @@ -7,7 +7,7 @@ class CmsDataNavigatorHarvester(DatasetHarvesterBase): A Harvester for the CMS Data Navigator catalog. ''' - HARVESTER_VERSION = "0.9aj" # increment to force an update even if nothing has changed + HARVESTER_VERSION = "0.9al" # increment to force an update even if nothing has changed def info(self): return { @@ -27,23 +27,26 @@ def set_dataset_info(self, package, dataset, dataset_defaults): extra(package, "Agency", "Department of Health & Human Services") package["author"] = "Centers for Medicare & Medicaid Services" extra(package, "author_id", "http://healthdata.gov/id/agency/cms") + extra(package, "Bureau Code", "009:38") package["title"] = dataset["Name"].strip() package["notes"] = dataset.get("Description") package["url"] = dataset.get("Address") - extra(package, "Date Released", parsedate(dataset["HealthData"].get("DateReleased"))) - extra(package, "Date Updated", parsedate(dataset["HealthData"].get("DateUpdated"))) - extra(package, "Agency Program URL", dataset["HealthData"].get("AgencyProgramURL")) + + dataset_hd = dataset["HealthData"] + extra(package, "Date Released", parsedate(dataset_hd.get("DateReleased"))) + extra(package, "Date Updated", parsedate(dataset_hd.get("DateUpdated"))) + extra(package, "Agency Program URL", dataset_hd.get("AgencyProgramURL")) extra(package, "Subject Area 1", "Medicare") - extra(package, "Unit of Analysis", dataset["HealthData"].get("UnitOfAnalysis")) - extra(package, "Data Dictionary", dataset["HealthData"].get("DataDictionaryURL")) - extra(package, "Coverage Period", dataset["HealthData"].get("Coverage Period")) - extra(package, "Collection Frequency", dataset["HealthData"].get("Collection Frequency")) - extra(package, "Geographic Scope", dataset["HealthData"].get("GeographicScope")) - #extra(package, "Contact Person", dataset["HealthData"].get("ContactName")) # not in HHS schema - #extra(package, "Contact Email", dataset["HealthData"].get("ContactEmail")) # not in HHS schema - extra(package, "License Agreement", dataset["HealthData"].get("DataLicenseAgreementURL")) - + extra(package, "Unit of Analysis", dataset_hd.get("UnitOfAnalysis")) + extra(package, "Data Dictionary", dataset_hd.get("DataDictionaryURL")) + extra(package, "Coverage Period", dataset_hd.get("Coverage Period")) + extra(package, "Collection Frequency", dataset_hd.get("Collection Frequency")) + extra(package, "Geographic Scope", dataset_hd.get("GeographicScope")) + extra(package, "Contact Name", dataset_hd.get("GenericContactName", None) or dataset_hd.get("ContactName")) # 'X or Y' syntax returns Y if X is either None or the empty string + extra(package, "Contact Email", dataset_hd.get("GenericContactEmail", None) or dataset_hd.get("ContactEmail")) + extra(package, "License Agreement", dataset_hd.get("DataLicenseAgreementURL")) + from ckan.lib.munge import munge_title_to_name package["tags"] = [ { "name": munge_title_to_name(t["Name"]) } for t in dataset.get("Keywords", [])] diff --git a/ckanext/datajson/harvester_datajson.py b/ckanext/datajson/harvester_datajson.py index c72b3c90..67891c01 100644 --- a/ckanext/datajson/harvester_datajson.py +++ b/ckanext/datajson/harvester_datajson.py @@ -1,4 +1,6 @@ from ckanext.datajson.harvester_base import DatasetHarvesterBase +from 
parse_datajson import parse_datajson_entry + import urllib2, json @@ -7,7 +9,7 @@ class DataJsonHarvester(DatasetHarvesterBase): A Harvester for /data.json files. ''' - HARVESTER_VERSION = "0.9aj" # increment to force an update even if nothing has changed + HARVESTER_VERSION = "0.9al" # increment to force an update even if nothing has changed def info(self): return { @@ -17,10 +19,47 @@ def info(self): } def load_remote_catalog(self, harvest_job): - return json.load(urllib2.urlopen(harvest_job.source.url)) + req = urllib2.Request(harvest_job.source.url) + # todo: into config and across harvester + req.add_header('User-agent', 'Data.gov/2.0') + try: + datasets = json.load(urllib2.urlopen(req)) + except UnicodeDecodeError: + # try different encode + try: + datasets = json.load(urllib2.urlopen(req), 'cp1252') + except: + datasets = json.load(urllib2.urlopen(req), 'iso-8859-1') + except: + # remove BOM + datasets = json.loads(lstrip_bom(urllib2.urlopen(req).read())) + + # The first dataset should be for the data.json file itself. Check that + # it is, and if so rewrite the dataset's title because Socrata exports + # these items all with the same generic name that is confusing when + # harvesting a bunch from different sources. It should have an accessURL + # but Socrata fills the URL of these in under webService. + if isinstance(datasets, list) and len(datasets) > 0 and (datasets[0].get("accessURL") == harvest_job.source.url + or datasets[0].get("webService") == harvest_job.source.url) and \ + datasets[0].get("title") == "Project Open Data, /data.json file": + datasets[0]["title"] = "%s Project Open Data data.json File" % harvest_job.source.title + + catalog_values = None + if isinstance(datasets, dict): + # this is a catalog, not dataset array as in schema 1.0. + catalog_values = datasets.copy() + datasets = catalog_values.pop("dataset", []) + + return (datasets, catalog_values) - def set_dataset_info(self, pkg, dataset, dataset_defaults): - from parse_datajson import parse_datajson_entry - parse_datajson_entry(dataset, pkg, dataset_defaults) - + def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version): + parse_datajson_entry(dataset, pkg, dataset_defaults, schema_version) +# helper function to remove BOM +def lstrip_bom(str_): + from codecs import BOM_UTF8 + bom = BOM_UTF8 + if str_.startswith(bom): + return str_[len(bom):] + else: + return str_ diff --git a/ckanext/datajson/parse_datajson.py b/ckanext/datajson/parse_datajson.py index e624096c..63c6f5f3 100644 --- a/ckanext/datajson/parse_datajson.py +++ b/ckanext/datajson/parse_datajson.py @@ -1,81 +1,150 @@ +from ckan.lib.munge import munge_title_to_name + import re -def parse_datajson_entry(datajson, package, defaults): - package["title"] = datajson.get("title", defaults.get("Title")) - package["notes"] = datajson.get("description", defaults.get("Notes")) - package["tags"] = [ { "name": t } for t in - datajson.get("keyword", defaults.get("Tags", "")).split(",") if t.strip() != ""] - package["groups"] = [ { "name": g } for g in - defaults.get("Groups", [])] # the complexity of permissions makes this useless, CKAN seems to ignore - package["organization"] = datajson.get("organization", defaults.get("Organization")) - extra(package, "Group Name", defaults.get("Group Name")) # i.e. dataset grouping string - extra(package, "Date Updated", datajson.get("modified")) - extra(package, "Agency", defaults.get("Agency")) # i.e. federal department - package["publisher"] = datajson.get("publisher", defaults.get("Author")) # i.e. 
agency within HHS - extra(package, "author_id", defaults.get("author_id")) # i.e. URI for agency - extra(package, "Agency Program URL", defaults.get("Agency Program URL")) # i.e. URL for agency program - extra(package, "Contact Person", datajson.get("person")) # not in HHS schema - extra(package, "Contact Email", datajson.get("mbox")) # not in HHS schema - # "identifier" is handled by the harvester - extra(package, "Access Level", datajson.get("accessLevel")) # not in HHS schema - extra(package, "Data Dictionary", datajson.get("dataDictionary", defaults.get("Data Dictionary"))) - # accessURL is redundant with resources - # webService is redundant with resources - extra(package, "Format", datajson.get("format")) # not in HHS schema - extra(package, "License Agreement", datajson.get("license")) - #extra(package, "License Agreement Required", ...) - extra(package, "Geographic Scope", datajson.get("spatial")) - extra(package, "Temporal", datajson.get("temporal")) # HHS uses Coverage Period (FY) Start/End - extra(package, "Date Released", datajson.get("issued")) - #extra(package, "Collection Frequency", ...) - extra(package, "Publish Frequency", datajson.get("accrualPeriodicity")) # not in HHS schema - extra(package, "Language", datajson.get("language")) # not in HHS schema - extra(package, "Granularity", datajson.get("granularity")) # not in HHS schema - extra(package, "Data Quality Met", datajson.get("dataQuality")) # not in HHS schema - #extra(package, "Unit of Analysis", ...) - #extra(package, "Collection Instrument", ...) - extra(package, "Subject Area 1", datajson.get("theme", defaults.get("Subject Area 1"))) - extra(package, "Subject Area 2", defaults.get("Subject Area 2")) - extra(package, "Subject Area 2", defaults.get("Subject Area 3")) - extra(package, "Technical Documentation", datajson.get("references")) - extra(package, "Size", datajson.get("size")) # not in HHS schema - package["url"] = datajson.get("landingPage", datajson.get("webService", datajson.get("accessURL"))) - extra(package, "Feed", datajson.get("feed")) # not in HHS schema - extra(package, "System Of Records", datajson.get("systemOfRecords")) # not in HHS schema - package["resources"] = [ ] - for d in datajson.get("distribution", []): - for k in ("accessURL", "webService"): - if d.get(k, "").strip() != "": - r = { - "url": d[k], - "format": normalize_format(d.get("format", "Query Tool" if k == "webService" else "Unknown")), - } - extra(r, "Language", d.get("language")) - extra(r, "Size", d.get("size")) - - # work-around for Socrata-style formats array - try: - r["format"] = normalize_format(d["formats"][0]["label"]) - except: - pass - - r["name"] = r["format"] - - package["resources"].append(r) - +def parse_datajson_entry(datajson, package, defaults, schema_version): + # four fields need extra handling, which are + # 1.tag, 2.license, 3.maintainer_email, 4.publisher_hierarchy, + # 5.resources + + # 1. package["tags"] + package["tags"] = [ { "name": munge_title_to_name(t) } for t in + package.get("tags", "") if t.strip() != ""] + + # 2. 
package["license"] + licenses = { + 'Creative Commons Attribution':'cc-by', + 'Creative Commons Attribution Share-Alike':'cc-by-sa', + 'Creative Commons CCZero':'cc-zero', + 'Creative Commons Non-Commercial (Any)':'cc-nc', + 'GNU Free Documentation License':'gfdl', + 'License Not Specified':'notspecified', + 'Open Data Commons Attribution License':'odc-by', + 'Open Data Commons Open Database License (ODbL)':'odc-odbl', + 'Open Data Commons Public Domain Dedication and License (PDDL)':'odc-pddl', + 'Other (Attribution)':'other-at', + 'Other (Non-Commercial)':'other-nc', + 'Other (Not Open)':'other-closed', + 'Other (Open)':'other-open', + 'Other (Public Domain)':'other-pd', + 'UK Open Government Licence (OGL)':'uk-ogl', + } + + if not datajson.get("license", ""): + package["license_id"] = licenses.get("License Not Specified", ""); + elif licenses.get(datajson.get("license", ""), ""): + package["license_id"] = licenses.get(datajson.get("license", ""), "") + + # 3. package["maintainer_email"] + if package.get("maintainer_email"): + package["maintainer_email"] = \ + package.get("maintainer_email").replace("mailto:", "", 1) + + # 4. extras-publisher and extras-publisher_hierarchy + if schema_version == '1.1': + publisher = find_extra(package, "publisher", {}) + publisher_name = publisher.get("name", "") + set_extra(package, "publisher", publisher_name) + parent_publisher = publisher.get("subOrganizationOf", {}) + publisher_hierarchy = [] + while parent_publisher: + parent_name = parent_publisher.get("name", "") + parent_publisher = parent_publisher.get("subOrganizationOf", {}) + publisher_hierarchy.append(parent_name) + if publisher_hierarchy: + publisher_hierarchy.reverse() + publisher_hierarchy.append(publisher_name) + publisher_hierarchy = " > ".join(publisher_hierarchy) + set_extra(package, "publisher_hierarchy", publisher_hierarchy) + + # 5. package["resources"] + # if distribution is empty, assemble it with root level accessURL and format. + # but firstly it can be an ill-formated dict. 
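+ # Coerce a lone dict into a one-element list and any other non-list value into an empty list before falling back to the top-level accessURL/webService fields.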
+ distribution = datajson.get("distribution", []) + if isinstance(distribution, dict): distribution = [distribution] + if not isinstance(distribution, list): distribution = [] + + downloadurl_key = "downloadURL" + acccessurl_key = "accessURL" + webservice_key = "webService" + if datajson.get("processed_how", []) and "lowercase" in datajson.get("processed_how", []): + acccessurl_key = acccessurl_key.lower() + webservice_key = webservice_key.lower() + + if not distribution: + for url in (acccessurl_key, webservice_key): + if datajson.get(url, "") and datajson.get(url, "").strip(): + d = { + url: datajson.get(url, ""), + "format": datajson.get("format", ""), + "mimetype": datajson.get("format", ""), + } + distribution.append(d) + + datajson["distribution"] = distribution + + for d in datajson.get("distribution", []): + downloadurl_value = d.get(downloadurl_key, "").strip() + accessurl_value = d.get(acccessurl_key, "").strip() + webservice_value = d.get(webservice_key, "").strip() + + which_value = (accessurl_value or webservice_value) if schema_version == '1.0' else (downloadurl_value or accessurl_value) + + if which_value: + r = {} + r['url'] = which_value + r['format'] = d.get("format", "") if schema_version == '1.0' else d.get("format", d.get("mediaType", "")) + r['mimetype'] = d.get("format", "") if schema_version == '1.0' else d.get("mediaType", "") + r['description'] = d.get('description', '') + r['name'] = d.get('title', '') + + # after schema 1.1+, we have some extra fields for resource + resource_extras = ['conformsTo', 'describedBy', 'describedByType'] + for resource_extra_key in resource_extras: + resource_extra_value = d.get(resource_extra_key) + if resource_extra_value: + r[resource_extra_key] = resource_extra_value + + # after schema 1.1+, include acccessurl if it is left over + if downloadurl_value and accessurl_value: + r['accessURL'] = accessurl_value + + package["resources"].append(r) + def extra(package, key, value): - if not value: return - package.setdefault("extras", []).append({ "key": key, "value": value }) - -def normalize_format(format): - # Format should be a file extension. But sometimes Socrata outputs a MIME type. - format = format.lower() - m = re.match(r"((application|text)/(\S+))(; charset=.*)?", format) - if m: - if m.group(1) == "text/plain": return "Text" - if m.group(1) == "application/zip": return "ZIP" - if m.group(1) == "application/vnd.ms-excel": return "XLS" - if m.group(1) == "application/x-msaccess": return "Access" - return "Other" - if format == "text": return "Text" - return format.upper() # hope it's one of our formats by converting to upprecase + if not value: return + package.setdefault("extras", []).append({ "key": key, "value": value }) + +def find_extra(pkg, key, default): + for extra in pkg["extras"]: + if extra["key"] == key: + ret = extra["value"] + break + else: + ret = default + + return ret + +def set_extra(pkg, key, value): + for extra in pkg["extras"]: + if extra["key"] == key: + extra["value"] = value + break + else: + pkg["extras"].append({"key":key, "value":value}) + +def normalize_format(format, raise_on_unknown=False): + if format is None: return + # Format should be a file extension. But sometimes Socrata outputs a MIME type. 
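+ # e.g. "application/zip" maps to "ZIP" and "text/plain" to "Text"; bare extensions such as "csv" are simply upper-cased.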
+ format = format.lower() + m = re.match(r"((application|text)/(\S+))(; charset=.*)?", format) + if m: + if m.group(1) == "text/plain": return "Text" + if m.group(1) == "application/zip": return "ZIP" + if m.group(1) == "application/vnd.ms-excel": return "XLS" + if m.group(1) == "application/x-msaccess": return "Access" + if raise_on_unknown: raise ValueError() # caught & ignored by caller + return "Other" + if format == "text": return "Text" + if raise_on_unknown and "?" in format: raise ValueError() # weird value we should try to filter out; exception is caught & ignored by caller + return format.upper() # hope it's one of our formats by converting to upprecase \ No newline at end of file diff --git a/ckanext/datajson/plugin.py b/ckanext/datajson/plugin.py index 99d5d403..7320d3d4 100644 --- a/ckanext/datajson/plugin.py +++ b/ckanext/datajson/plugin.py @@ -1,38 +1,26 @@ -import ckan.plugins as p +import logging +import StringIO +import json +import ckan.plugins as p from ckan.lib.base import BaseController, render, c -import ckan.model as model from pylons import request, response +import re +import ckan.model as model import ckan.lib.dictization.model_dictize as model_dictize -import json, re -import logging from jsonschema.exceptions import best_match -import StringIO - -logger = logging.getLogger('datajson') -def get_validator(): - import os - from jsonschema import Draft4Validator, FormatChecker - - schema_path = os.path.join(os.path.dirname(__file__), 'schema', 'federal-v1.1', 'dataset.json') - with open(schema_path, 'r') as file: - schema = json.loads(file.read()) - return Draft4Validator(schema, format_checker=FormatChecker()) - - logger.warn('Unable to create validator') - return None - - -validator = get_validator() +logger = logging.getLogger('datajson') try: from collections import OrderedDict # 2.7 except ImportError: from sqlalchemy.util import OrderedDict -from build_datajson import make_datajson_entry, make_datajson_catalog +from build_datajson import JsonExportBuilder + +from build_datajson import make_datajson_entry, get_facet_fields # from build_enterprisedatajson import make_enterprisedatajson_entry from build_datajsonld import dataset_to_jsonld @@ -41,15 +29,14 @@ def get_validator(): class DataJsonPlugin(p.SingletonPlugin): p.implements(p.interfaces.IConfigurer) p.implements(p.interfaces.IRoutes, inherit=True) + p.implements(p.interfaces.IFacets) + + # IConfigurer def update_config(self, config): # Must use IConfigurer rather than IConfigurable because only IConfigurer # is called before after_map, in which we need the configuration directives # to know how to set the paths. - - # TODO commenting out enterprise data inventory for right now - # DataJsonPlugin.route_edata_path = config.get("ckanext.enterprisedatajson.path", "/enterprisedata.json") - DataJsonPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True' DataJsonPlugin.route_path = config.get("ckanext.datajson.path", "/data.json") DataJsonPlugin.route_ld_path = config.get("ckanext.datajsonld.path", re.sub(r"\.json$", ".jsonld", DataJsonPlugin.route_path)) @@ -61,36 +48,203 @@ def update_config(self, config): # relative to the path of *this* file. Wow. 
p.toolkit.add_template_directory(config, "templates") + # IRoutes + + def before_map(self, m): + return m + + def after_map(self, m): + # /data.json and /data.jsonld (or other path as configured by user) + m.connect('datajson', DataJsonPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController', + action='generate_json') + m.connect('datajsonld', DataJsonPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', + action='generate_jsonld') + + # /pod/validate + m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:DataJsonController', + action='validator') + + # /pod/data-listing + m.connect('datajsonhtml', "/pod/data-catalog", controller='ckanext.datajson.plugin:DataJsonController', + action='show_html_rendition') + + return m + + # IFacets + + def dataset_facets(self, facets, package_type): + # Add any facets specified in build_datajson.get_facet_fields() to the top + # of the facet list, and then put the CKAN default facets below that. + f = OrderedDict() + f.update(get_facet_fields()) + f.update(facets) + return f + + def group_facets(self, facets_dict, group_type, package_type): + return facets_dict + + def organization_facets(self, facets_dict, organization_type, package_type): + return facets_dict + + +class DataJsonController(BaseController): + def generate_output(self, fmt): + # set content type (charset required or pylons throws an error) + response.content_type = 'application/json; charset=UTF-8' + + # allow caching of response (e.g. by Apache) + del response.headers["Cache-Control"] + del response.headers["Pragma"] + + # output + data = self.make_json() + + if fmt == 'json-ld': + # Convert this to JSON-LD. + data = OrderedDict([ + ("@context", OrderedDict([ + ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"), + ("dcterms", "http://purl.org/dc/terms/"), + ("dcat", "http://www.w3.org/ns/dcat#"), + ("foaf", "http://xmlns.com/foaf/0.1/"), + ("pod", "http://project-open-data.github.io/schema/2013-09-20_1.0#"), + ])), + ("@id", DataJsonPlugin.ld_id), + ("@type", "dcat:Catalog"), + ("dcterms:title", DataJsonPlugin.ld_title), + ("rdfs:label", DataJsonPlugin.ld_title), + ("foaf:homepage", DataJsonPlugin.site_url), + ("dcat:dataset", [dataset_to_jsonld(d) for d in data]), + ]) + + return p.toolkit.literal(json.dumps(data, indent=2)) + + def make_json(self): + # Build the data.json file. + packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) + return [make_datajson_entry(pkg) for pkg in packages if pkg["type"] == "dataset"] + + def generate_json(self): + return self.generate_output('json') + + def generate_jsonld(self): + return self.generate_output('json-ld') + + def validator(self): + # Validates that a URL is a good data.json file. + if request.method == "POST" and "url" in request.POST and request.POST["url"].strip() != "": + c.source_url = request.POST["url"] + c.errors = [] + + import urllib + import json + from datajsonvalidator import do_validation + + body = None + try: + body = json.load(urllib.urlopen(c.source_url)) + except IOError as e: + c.errors.append(("Error Loading File", ["The address could not be loaded: " + unicode(e)])) + except ValueError as e: + c.errors.append(("Invalid JSON", ["The file does not meet basic JSON syntax requirements: " + unicode( + e) + ". 
Try using JSONLint.com."])) + except Exception as e: + c.errors.append(( + "Internal Error", + ["Something bad happened while trying to load and parse the file: " + unicode(e)])) + + if body: + try: + do_validation(body, c.source_url, c.errors) + except Exception as e: + c.errors.append(("Internal Error", ["Something bad happened: " + unicode(e)])) + if len(c.errors) == 0: + c.errors.append(("No Errors", ["Great job!"])) + + return render('datajsonvalidator.html') + + def show_html_rendition(self): + # Shows an HTML rendition of the data.json file. Requests the file live + # from http://localhost/data.json. + + import urllib + import json + + try: + c.catalog_data = json.load(urllib.urlopen("http://localhost/data.json")) + except Exception as e: + c.catalog_data = [] + + c.catalog_data.sort(key=lambda x: x.get("modified"), reverse=True) + + return render('html_rendition.html') + + +class JsonExportPlugin(p.SingletonPlugin): + p.implements(p.interfaces.IConfigurer) + p.implements(p.interfaces.IRoutes, inherit=True) + + def update_config(self, config): + # Must use IConfigurer rather than IConfigurable because only IConfigurer + # is called before after_map, in which we need the configuration directives + # to know how to set the paths. + + # TODO commenting out enterprise data inventory for right now + # JsonExportPlugin.route_edata_path = config.get("ckanext.enterprisedatajson.path", "/enterprisedata.json") + JsonExportPlugin.route_enabled = config.get("ckanext.datajson.url_enabled", "True") == 'True' + JsonExportPlugin.route_path = config.get("ckanext.datajson.path", "/data.json") + JsonExportPlugin.route_ld_path = config.get("ckanext.datajsonld.path", + re.sub(r"\.json$", ".jsonld", JsonExportPlugin.route_path)) + JsonExportPlugin.ld_id = config.get("ckanext.datajsonld.id", config.get("ckan.site_url")) + JsonExportPlugin.ld_title = config.get("ckan.site_title", "Catalog") + JsonExportPlugin.site_url = config.get("ckan.site_url") + + # Adds our local templates directory. It's smart. It knows it's + # relative to the path of *this* file. Wow. 
+ p.toolkit.add_template_directory(config, "templates") + def before_map(self, m): return m def after_map(self, m): - if DataJsonPlugin.route_enabled: + if JsonExportPlugin.route_enabled: # /data.json and /data.jsonld (or other path as configured by user) - m.connect('datajson', DataJsonPlugin.route_path, controller='ckanext.datajson.plugin:DataJsonController', + m.connect('datajson_export', JsonExportPlugin.route_path, + controller='ckanext.datajson.plugin:JsonExportController', action='generate_json') # TODO commenting out enterprise data inventory for right now - # m.connect('enterprisedatajson', DataJsonPlugin.route_edata_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_enterprise') - #m.connect('datajsonld', DataJsonPlugin.route_ld_path, controller='ckanext.datajson.plugin:DataJsonController', action='generate_jsonld') + # m.connect('enterprisedatajson', JsonExportPlugin.route_edata_path, + # controller='ckanext.datajson.plugin:JsonExportController', action='generate_enterprise') + + # m.connect('datajsonld', JsonExportPlugin.route_ld_path, + # controller='ckanext.datajson.plugin:JsonExportController', action='generate_jsonld') # TODO DWC update action # /data/{org}/data.json m.connect('public_data_listing', '/organization/{org}/data.json', - controller='ckanext.datajson.plugin:DataJsonController', action='generate_pdl') + controller='ckanext.datajson.plugin:JsonExportController', action='generate_pdl') # TODO DWC update action # /data/{org}/edi.json m.connect('enterprise_data_inventory', '/organization/{org}/edi.json', - controller='ckanext.datajson.plugin:DataJsonController', action='generate_edi') + controller='ckanext.datajson.plugin:JsonExportController', action='generate_edi') + + # TODO DWC update action + # /data/{org}/edi.json + m.connect('enterprise_data_inventory', '/organization/{org}/draft.json', + controller='ckanext.datajson.plugin:JsonExportController', action='generate_draft') # /pod/validate - # m.connect('datajsonvalidator', "/pod/validate", controller='ckanext.datajson.plugin:DataJsonController', action='validator') + # m.connect('datajsonvalidator', "/pod/validate", + # controller='ckanext.datajson.plugin:JsonExportController', action='validator') return m -class DataJsonController(BaseController): - def generate_output(self, format): +class JsonExportController(BaseController): + _errors_json = [] + + def generate_output(self, fmt): # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -100,9 +254,9 @@ def generate_output(self, format): # TODO special processing for enterprise # output - data = make_json() + data = self.make_json() - if format == 'json-ld': + if fmt == 'json-ld': # Convert this to JSON-LD. 
data = OrderedDict([ ("@context", OrderedDict([ @@ -110,13 +264,12 @@ def generate_output(self, format): ("dcterms", "http://purl.org/dc/terms/"), ("dcat", "http://www.w3.org/ns/dcat#"), ("foaf", "http://xmlns.com/foaf/0.1/"), - ]) - ), - ("@id", DataJsonPlugin.ld_id), + ])), + ("@id", JsonExportPlugin.ld_id), ("@type", "dcat:Catalog"), - ("dcterms:title", DataJsonPlugin.ld_title), - ("rdfs:label", DataJsonPlugin.ld_title), - ("foaf:homepage", DataJsonPlugin.site_url), + ("dcterms:title", JsonExportPlugin.ld_title), + ("rdfs:label", JsonExportPlugin.ld_title), + ("foaf:homepage", JsonExportPlugin.site_url), ("dcat:dataset", [dataset_to_jsonld(d) for d in data]), ]) @@ -134,7 +287,8 @@ def validator(self): c.source_url = request.POST["url"] c.errors = [] - import urllib, json + import urllib + import json from datajsonvalidator import do_validation body = None @@ -147,7 +301,8 @@ def validator(self): e) + ". Try using JSONLint.com."])) except Exception as e: c.errors.append(( - "Internal Error", ["Something bad happened while trying to load and parse the file: " + unicode(e)])) + "Internal Error", + ["Something bad happened while trying to load and parse the file: " + unicode(e)])) if body: try: @@ -160,11 +315,12 @@ def validator(self): return render('datajsonvalidator.html') def generate_pdl(self): - # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/data.json", request.path) - #If user is not editor or admin of the organization then don't allow pdl download - if p.toolkit.check_access('package_create', {'model': model,'user':c.user}, {'owner_org': match.group(1)}): + # If user is not editor or admin of the organization then don't allow pdl download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): if match: # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -172,15 +328,16 @@ def generate_pdl(self): # allow caching of response (e.g. by Apache) del response.headers["Cache-Control"] del response.headers["Pragma"] - return make_pdl(match.group(1)) + return self.make_pdl(match.group(1)) return "Invalid organization id" def generate_edi(self): - # DWC this is a hack, as I couldn't get to the request parameters. For whatever reason, the multidict was always empty + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty match = re.match(r"/organization/([-a-z0-9]+)/edi.json", request.path) - #If user is not editor or admin of the organization then don't allow edi download - if p.toolkit.check_access('package_create', {'model': model,'user':c.user}, {'owner_org': match.group(1)}): + # If user is not editor or admin of the organization then don't allow edi download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): if match: # set content type (charset required or pylons throws an error) response.content_type = 'application/json; charset=UTF-8' @@ -188,167 +345,310 @@ def generate_edi(self): # allow caching of response (e.g. 
by Apache) del response.headers["Cache-Control"] del response.headers["Pragma"] - return make_edi(match.group(1)) + return self.make_edi(match.group(1)) return "Invalid organization id" + def generate_draft(self): + # DWC this is a hack, as I couldn't get to the request parameters. + # For whatever reason, the multidict was always empty + match = re.match(r"/organization/([-a-z0-9]+)/draft.json", request.path) -def make_json(): - # Build the data.json file. - packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) - output = [] - # Create data.json only using public and public-restricted datasets, datasets marked non-public are not exposed - for pkg in packages: - extras = dict([(x['key'], x['value']) for x in pkg['extras']]) - try: - if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_entry(pkg) - if datajson_entry: + # If user is not editor or admin of the organization then don't allow edi download + if p.toolkit.check_access('package_create', {'model': model, 'user': c.user}, {'owner_org': match.group(1)}): + if match: + # set content type (charset required or pylons throws an error) + response.content_type = 'application/json; charset=UTF-8' + + # allow caching of response (e.g. by Apache) + del response.headers["Cache-Control"] + del response.headers["Pragma"] + return self.make_draft(match.group(1)) + return "Invalid organization id" + + def make_json(self): + # Build the data.json file. + packages = p.toolkit.get_action("current_package_list_with_resources")(None, {}) + output = [] + # Create data.json only using public and public-restricted datasets, datasets marked non-public are not exposed + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + try: + if not (re.match(r'[Nn]on-public', extras['public_access_level'])): + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if datajson_entry: + output.append(datajson_entry) + else: + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) + except KeyError: + publisher = self.detect_publisher(extras) + + logger.warn( + "Dataset id=[%s], title=[%s], organization=[%s] missing required 'public_access_level' field", + pkg.get('id', None), + pkg.get('title', None), + publisher) + + errors = ['Missing Required Field', ['public_access_level']] + + self._errors_json.append(OrderedDict([ + ('id', pkg.get('id')), + ('name', pkg.get('name')), + ('title', pkg.get('title')), + ('organization', publisher), + ('errors', errors), + ])) + pass + return output + + def make_draft(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. 
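+ # Only datasets whose 'publishing_status' extra is 'Draft' are exported here; everything else is skipped in the loop below.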
+ packages = self.get_packages(owner_org) + + errors_json = [] + + output = [] + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' not in extras.keys() or extras['publishing_status'] != 'Draft': + continue + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, errors_json, zip_name='draft') + + @staticmethod + def detect_publisher(extras): + publisher = None + + if 'publisher' in extras and extras['publisher']: + publisher = JsonExportBuilder.strip_if_string(extras['publisher']) + + for i in range(1, 6): + key = 'publisher_' + str(i) + if key in extras and extras[key] and JsonExportBuilder.strip_if_string(extras[key]): + publisher = JsonExportBuilder.strip_if_string(extras[key]) + return publisher + + def make_edi(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. + packages = self.get_packages(owner_org) + + output = [] + errors_json = [] + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' in extras.keys() and extras['publishing_status'] == 'Draft': + continue + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None + if datajson_entry and self.is_valid(datajson_entry): + output.append(datajson_entry) + else: + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, errors_json, zip_name='edi') + + def make_pdl(self, owner_org): + # Error handler for creating error log + stream = StringIO.StringIO() + eh = logging.StreamHandler(stream) + eh.setLevel(logging.WARN) + formatter = logging.Formatter('%(asctime)s - %(message)s') + eh.setFormatter(formatter) + logger.addHandler(eh) + + # Build the data.json file. 
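+ # The inverse of make_draft: datasets marked 'Draft' are skipped and all remaining datasets, public or non-public, are exported.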
+ packages = self.get_packages(owner_org) + + output = [] + errors_json = [] + # Create data.json only using public datasets, datasets marked non-public are not exposed + for pkg in packages: + extras = dict([(x['key'], x['value']) for x in pkg['extras']]) + if 'publishing_status' in extras.keys() and extras['publishing_status'] == 'Draft': + continue + try: + if re.match(r'[Nn]on-public', extras['public_access_level']): + continue + datajson_entry = JsonExportBuilder.make_datajson_export_entry(pkg) + if 'errors' in datajson_entry.keys(): + errors_json.append(datajson_entry) + datajson_entry = None + if datajson_entry and self.is_valid(datajson_entry): output.append(datajson_entry) else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - except KeyError: - logger.warn("Dataset id=[%s], title=[%s] missing required 'public_access_level' field", pkg.get('id', None), - pkg.get('title', None)) - pass - return output - - -def make_edi(owner_org): - # Error handler for creating error log - stream = StringIO.StringIO() - eh = logging.StreamHandler(stream) - eh.setLevel(logging.WARN) - formatter = logging.Formatter('%(asctime)s - %(message)s') - eh.setFormatter(formatter) - logger.addHandler(eh) - - # Build the data.json file. - packages = get_packages(owner_org) - - output = [] - for pkg in packages: - #if pkg['owner_org'] == owner_org: - datajson_entry = make_datajson_entry(pkg) - if datajson_entry and is_valid(datajson_entry): - output.append(datajson_entry) + publisher = self.detect_publisher(extras) + logger.warn("Dataset id=[%s], title=[%s], organization=[%s] omitted\n", pkg.get('id', None), + pkg.get('title', None), publisher) + + except KeyError: + publisher = self.detect_publisher(extras) + + logger.warn( + "Dataset id=[%s], title=['%s'], organization=['%s'] missing required 'public_access_level' field", + pkg.get('id', None), pkg.get('title', None), publisher) + + errors = ['Missing Required Field', ['public_access_level']] + + self._errors_json.append(OrderedDict([ + ('id', pkg.get('id')), + ('name', pkg.get('name')), + ('title', pkg.get('title')), + ('organization', publisher), + ('errors', errors), + ])) + pass + + # Get the error log + eh.flush() + error = stream.getvalue() + eh.close() + logger.removeHandler(eh) + stream.close() + + # return json.dumps(output) + return self.write_zip(output, error, errors_json, zip_name='pdl') + + def get_packages(self, owner_org): + # Build the data.json file. + packages = self.get_all_group_packages(group_id=owner_org) + # get packages for sub-agencies. + sub_agency = model.Group.get(owner_org) + if 'sub-agencies' in sub_agency.extras.col.target \ + and sub_agency.extras.col.target['sub-agencies'].state == 'active': + sub_agencies = sub_agency.extras.col.target['sub-agencies'].value + sub_agencies_list = sub_agencies.split(",") + for sub in sub_agencies_list: + sub_packages = self.get_all_group_packages(group_id=sub) + for sub_package in sub_packages: + packages.append(sub_package) + + return packages + + def get_all_group_packages(self, group_id): + """ + Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages. + """ + result = [] + for pkg_rev in model.Group.get(group_id).packages(with_private=True, context={'user_is_admin': True}): + result.append(model_dictize.package_dictize(pkg_rev, {'model': model})) + + return result + + def is_valid(self, instance): + """ + Validates a data.json entry against the project open data's JSON schema. 
+ Log a warning message on validation error + """ + error = best_match(validator.iter_errors(instance)) + if error: + logger.warn("Validation failed, best guess of error = %s", error) + return False + return True + + def write_zip(self, data, error=None, errors_json=None, zip_name='data'): + """ + Data: a python object to write to the data.json + Error: unicode string representing the content of the error log. + zip_name: the name to use for the zip file + """ + import zipfile + + o = StringIO.StringIO() + zf = zipfile.ZipFile(o, mode='w') + + data_file_name = 'data.json' + if 'draft' == zip_name: + data_file_name = 'draft_data.json' + + # Write the data file + if data: + zf.writestr(data_file_name, + json.dumps(JsonExportBuilder.make_datajson_export_catalog(data), ensure_ascii=False).encode( + 'utf8')) + # Write empty.json if nothing to return else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - - # Get the error log - eh.flush() - error = stream.getvalue() - eh.close() - logger.removeHandler(eh) - stream.close() - - #return json.dumps(output) - return write_zip(output, error, zip_name='edi') - - -def make_pdl(owner_org): - # Error handler for creating error log - stream = StringIO.StringIO() - eh = logging.StreamHandler(stream) - eh.setLevel(logging.WARN) - formatter = logging.Formatter('%(asctime)s - %(message)s') - eh.setFormatter(formatter) - logger.addHandler(eh) - - # Build the data.json file. - packages = get_packages(owner_org) - - output = [] - #Create data.json only using public datasets, datasets marked non-public are not exposed - for pkg in packages: - extras = dict([(x['key'], x['value']) for x in pkg['extras']]) - try: - if not (re.match(r'[Nn]on-public', extras['public_access_level'])): - datajson_entry = make_datajson_entry(pkg) - if datajson_entry and is_valid(datajson_entry): - output.append(datajson_entry) - else: - logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None)) - - except KeyError: - logger.warn("Dataset id=[%s], title=['%s'] missing required 'public_access_level' field", - pkg.get('id', None), pkg.get('title', None)) - pass - - # Get the error log - eh.flush() - error = stream.getvalue() - eh.close() - logger.removeHandler(eh) - stream.close() - - #return json.dumps(output) - return write_zip(output, error, zip_name='pdl') - -def get_packages(owner_org): - # Build the data.json file. - packages = get_all_group_packages(group_id=owner_org) - #get packages for sub-agencies. - sub_agency = model.Group.get(owner_org) - if 'sub-agencies' in sub_agency.extras.col.target and \ - sub_agency.extras.col.target['sub-agencies'].state == 'active': - sub_agencies = sub_agency.extras.col.target['sub-agencies'].value - sub_agencies_list = sub_agencies.split(",") - for sub in sub_agencies_list: - sub_packages = get_all_group_packages(group_id=sub) - for sub_package in sub_packages: - packages.append(sub_package) - - return packages - -def get_all_group_packages(group_id): - """ - Gets all of the group packages, public or private, returning them as a list of CKAN's dictized packages. - """ - result = [] - for pkg_rev in model.Group.get(group_id).packages(with_private=True, context={'user_is_admin': True}): - result.append(model_dictize.package_dictize(pkg_rev, {'model': model})) - - return result - - -def is_valid(instance): - """ - Validates a data.json entry against the project open data's JSON schema. 
Log a warning message on validation error - """ - error = best_match(validator.iter_errors(instance)) - if error: - logger.warn("Validation failed, best guess of error = %s", error) - return False - return True - - -def write_zip(data, error=None, zip_name='data'): - """ - Data: a python object to write to the data.json - Error: unicode string representing the content of the error log. - zip_name: the name to use for the zip file - """ - import zipfile - - o = StringIO.StringIO() - zf = zipfile.ZipFile(o, mode='w') - - # Write the data file - if data: - zf.writestr('data.json', json.dumps(make_datajson_catalog(data), ensure_ascii=False).encode('utf8')) - - #Write the error log - if error: - zf.writestr('errorlog.txt', error.encode('utf8')) - - zf.close() - o.seek(0) - - binary = o.read() - o.close() - - response.content_type = 'application/octet-stream' - response.content_disposition = 'attachment; filename="%s.zip"' % zip_name - - return binary + zf.writestr('empty.json', '') + + if self._errors_json: + if errors_json: + errors_json += self._errors_json + else: + errors_json = self._errors_json + + # Errors in json format + if errors_json: + zf.writestr('errors.json', json.dumps(errors_json).encode('utf8')) + + # Write the error log + if error: + zf.writestr('errorlog.txt', error.encode('utf8')) + + zf.close() + o.seek(0) + + binary = o.read() + o.close() + + response.content_type = 'application/octet-stream' + response.content_disposition = 'attachment; filename="%s.zip"' % zip_name + + return binary + + +def get_validator(): + import os + from jsonschema import Draft4Validator, FormatChecker + + schema_path = os.path.join(os.path.dirname(__file__), 'pod_schema', 'federal-v1.1', 'dataset.json') + with open(schema_path, 'r') as schema: + schema = json.loads(schema.read()) + return Draft4Validator(schema, format_checker=FormatChecker()) +validator = get_validator() \ No newline at end of file diff --git a/ckanext/datajson/schema/federal-v1.1/catalog.json b/ckanext/datajson/pod_schema/federal-v1.1/catalog.json similarity index 100% rename from ckanext/datajson/schema/federal-v1.1/catalog.json rename to ckanext/datajson/pod_schema/federal-v1.1/catalog.json diff --git a/ckanext/datajson/schema/federal-v1.1/dataset.json b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json similarity index 84% rename from ckanext/datajson/schema/federal-v1.1/dataset.json rename to ckanext/datajson/pod_schema/federal-v1.1/dataset.json index 8fdfce22..21b09dbe 100644 --- a/ckanext/datajson/schema/federal-v1.1/dataset.json +++ b/ckanext/datajson/pod_schema/federal-v1.1/dataset.json @@ -62,6 +62,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -77,7 +81,7 @@ "uniqueItems": true }, "contactPoint": { - "$ref": "vcard.json" + "$ref": "#/definitions/vcard" }, "describedBy": { "title": "Data Dictionary", @@ -89,6 +93,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -102,6 +110,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -115,6 +127,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -127,6 +143,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -143,13 +163,25 @@ { "type": "array", "items": { - "$ref": "distribution.json", - "minItems": 1, - "uniqueItems": true + "anyOf": [ + { + "$ref": "#/definitions/distribution", 
+ "minItems": 1, + "uniqueItems": true + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] } }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -169,18 +201,30 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "keyword": { "title": "Tags", "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "landingPage": { "title": "Homepage URL", @@ -192,6 +236,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -208,6 +256,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -221,6 +273,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -239,6 +295,10 @@ { "type": "string", "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -252,6 +312,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -267,7 +331,7 @@ "uniqueItems": true }, "publisher": { - "$ref": "organization.json" + "$ref": "#/definitions/organization" }, "references": { "title": "Related Documents", @@ -276,14 +340,26 @@ { "type": "array", "items": { - "type": "string", - "format": "uri" + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "minItems": 1, "uniqueItems": true }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -331,6 +407,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -341,6 +421,9 @@ { "type": "string", "minLength": 1 + }, + { + "type": "null" } ] }, @@ -359,6 +442,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -397,8 +484,16 @@ "hasEmail": { "title": "Email", "description": "Email address for the contact", - "pattern": "^mailto:([\\w.-]+@[\\w.-]+\\.[\\w.-]+)?$", - "type": "string" + "anyOf": [ + { + "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$", + "type": "string" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] } } }, @@ -432,8 +527,16 @@ "downloadURL": { "title": "Download URL", "description": "URL providing direct access to a downloadable file of a dataset", - "type": "string", - "format": "uri" + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, 
"mediaType": { "title": "Media Type", @@ -445,6 +548,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -471,6 +578,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -510,6 +621,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -523,6 +638,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -536,6 +655,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] } @@ -566,7 +689,7 @@ }, "subOrganizationOf": { "title": "Parent Organization", - "$ref": "organization.json" + "$ref": "#" } } } diff --git a/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json b/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json new file mode 100644 index 00000000..95fcd75c --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json @@ -0,0 +1,58 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/catalog.json#", + "title": "Project Open Data Catalog", + "description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.", + "type": "object", + "dependencies": { + "@type": [ + "@context" + ] + }, + "required": [ + "conformsTo", + "dataset" + ], + "properties": { + "@context": { + "title": "Metadata Context", + "description": "URL or JSON object for the JSON-LD Context that defines the schema used", + "type": "string", + "format": "uri" + }, + "@id": { + "title": "Metadata Catalog ID", + "description": "IRI for the JSON-LD Node Identifier of the Catalog. This should be the URL of the data.json file itself.", + "type": "string", + "format": "uri" + }, + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be dcat:Catalog for the Catalog", + "enum": [ + "dcat:Catalog" + ] + }, + "conformsTo": { + "description": "Version of Schema", + "title": "Version of Schema", + "enum": [ + "https://project-open-data.cio.gov/v1.1/schema" + ] + }, + "describedBy": { + "description": "URL for the JSON Schema file that defines the schema used", + "title": "Data Dictionary", + "type": "string", + "format": "uri" + }, + "dataset": { + "type": "array", + "items": { + "$ref": "dataset-non-federal.json", + "minItems": 1, + "uniqueItems": true + } + } + } +} \ No newline at end of file diff --git a/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json new file mode 100644 index 00000000..b131a63b --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal-v1.1/dataset-non-federal.json @@ -0,0 +1,569 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/dataset-non-federal.json#", + "title": "Project Open Data Dataset", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "title", + "description", + "publisher", + "contactPoint", + "identifier", + "accessLevel" + ], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. 
This should be dcat:Dataset for each Dataset", + "enum": [ + "dcat:Dataset" + ] + }, + "accessLevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "rights": { + "title": "Rights", + "description": "This may include information regarding access or restrictions based on privacy, security, or other policies. This should also provide an explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accrualPeriodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "irregular" + ] + }, + { + "type": "string", + "pattern": "^R\\/P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "null" + } + ] + }, + "bureauCode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + "contactPoint": { + "$ref": "#/definitions/vcard-non-federal" + }, + "describedBy": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedByType": { + "title": "Data Dictionary Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + }, + "conformsTo": { + "title": "Data Standard", + "description": "URI used to identify a standardized specification the dataset conforms to", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataQuality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string", + "minLength": 1 + }, + "distribution": { + "title": "Distribution", + "description": "A container for the array of Distribution objects", + "anyOf": [ + { + "type": "array", + "items": { + "$ref": "#/definitions/distribution", + "minItems": 1, + "uniqueItems": true + } + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "minLength": 1 + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + "landingPage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + 
"license": { + "title": "License", + "description": "The license dataset or API is published with. See Open Licenses for more information.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + } + ] + }, + "primaryITInvestmentUII": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programCode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + "publisher": { + "$ref": "#/definitions/organization" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemOfRecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(R\\d*\\/)?P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "null" + } + ] + }, + "isPartOf": { + "title": "Collection", + "description": "The collection of which the dataset is a subset", + "anyOf": [ + { + "type": "string", + "minLength": 1 + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string", + "minLength": 1 + } + }, + "definitions": { + "vcard-non-federal": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/vcard-non-federal.json#", + "title": "Project Open Data ContactPoint vCard", + "description": "A Dataset ContactPoint as a vCard object", + "type": "object", + "required": [ + "fn" + ], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. 
This should be vcard:Contact for contactPoint", + "enum": [ + "vcard:Contact" + ] + }, + "fn": { + "title": "Contact Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "hasEmail": { + "title": "Email", + "description": "Email address for the contact", + "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$", + "type": "string" + } + } + }, + "distribution": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#", + "title": "Project Open Data Distribution", + "description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.", + "type": "object", + "dependencies": { + "downloadURL": { + "properties": { + "mediaType": { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + } + }, + "required": [ + "mediaType" + ] + } + }, + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be dcat:Distribution for each Distribution", + "enum": [ + "dcat:Distribution" + ] + }, + "downloadURL": { + "title": "Download URL", + "description": "URL providing direct access to a downloadable file of a dataset", + "type": "string", + "format": "uri" + }, + "mediaType": { + "title": "Media Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s downloadURL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + }, + "format": { + "title": "Format", + "description": "A human-readable description of the file format of a distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "accessURL": { + "title": "Access URL", + "description": "URL providing indirect access to a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "conformsTo": { + "title": "Data Standard", + "description": "URL providing indirect access to a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedBy": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the distribution found at the downloadURL", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "describedByType": { + "title": "Data Dictionary Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + } + ] + } + } + }, + "organization": { + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#", + "title": "Project Open Data Organization", + "description": "A Dataset Publisher Organization as a foaf:Agent object", + "type": 
"object", + "required": [ + "name" + ], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. This should be org:Organization for each publisher", + "enum": [ + "org:Organization" + ] + }, + "name": { + "title": "Publisher Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "subOrganizationOf": { + "title": "Parent Organization", + "$ref": "#" + } + } + } + } +} \ No newline at end of file diff --git a/ckanext/datajson/pod_schema/non-federal/single_entry.json b/ckanext/datajson/pod_schema/non-federal/single_entry.json new file mode 100644 index 00000000..ddc53fd4 --- /dev/null +++ b/ckanext/datajson/pod_schema/non-federal/single_entry.json @@ -0,0 +1,445 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "title", + "description", + "license", + "publisher", + "contactPoint", + "identifier", + "accessLevel" + ], + "properties": { + "accessLevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "accessLevelComment": { + "title": "Access Level Comment", + "description": "An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accrualPeriodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "Annual", + "Bimonthly", + "Semiweekly", + "Daily", + "Biweekly", + "Semiannual", + "Biennial", + "Triennial", + "Three times a week", + "Three times a month", + "Continuously updated", + "Monthly", + "Quarterly", + "Semimonthly", + "Three times a year", + "Weekly", + "Completely irregular" + ] + }, + { + "type": "null" + } + ] + }, + "bureauCode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "contactPoint": { + "title": "Contact Name", + "description": "Contact person’s name for the asset.", + "type": "string" + }, + "dataDictionary": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. 
Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataQuality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title": "Distribution", + "description": "Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": [ + "accessURL", + "format" + ], + "properties": { + "accessURL": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": 
"^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + }, + "landingPage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title": "License", + "description": "The license dataset or API is published with. 
See Open Licenses for more information.", + "type": "string", + "minLength": 1 + }, + "mbox": { + "title": "Contact Email", + "description": "Contact person’s email address.", + "anyOf": [ + { + "type": "string", + "format": "email" + }, + { + "type": "null" + }, + { + "type": "string" + } + ] + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "PrimaryITInvestmentUII": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programCode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. 
Use the format of 015:001", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "publisher": { + "title": "Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemOfRecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, 
+ { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webService": { + "title": "Endpoint", + "description": "Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + } + } +} diff --git a/ckanext/datajson/pod_schema/single_entry.json b/ckanext/datajson/pod_schema/single_entry.json new file mode 100644 index 00000000..825203ad --- /dev/null +++ b/ckanext/datajson/pod_schema/single_entry.json @@ -0,0 +1,451 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", + "title": "Common Core Metadata Schema", + "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", + "type": "object", + "required": [ + "bureaucode", + "programcode", + "title", + "description", + "keyword", + "modified", + "publisher", + "contactpoint", + "mbox", + "identifier", + "accesslevel" + ], + "properties": { + "accesslevel": { + "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", + "title": "Public Access Level", + "enum": [ + "public", + "restricted public", + "non-public" + ] + }, + "accesslevelcomment": { + "title": "Access Level Comment", + "description": "An explanation for the selected \"accesslevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. 
Text, 255 characters.", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 255 + }, + { + "type": "null" + } + ] + }, + "accessurl": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "accrualperiodicity": { + "title": "Frequency", + "description": "Frequency with which dataset is published.", + "anyOf": [ + { + "enum": [ + "Annual", + "Bimonthly", + "Semiweekly", + "Daily", + "Biweekly", + "Semiannual", + "Biennial", + "Triennial", + "Three times a week", + "Three times a month", + "Continuously updated", + "Monthly", + "Quarterly", + "Semimonthly", + "Three times a year", + "Weekly", + "Completely irregular" + ] + }, + { + "type": "null" + } + ] + }, + "bureaucode": { + "title": "Bureau Code", + "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + "contactpoint": { + "title": "Contact Name", + "description": "Contact person’s name for the asset.", + "type": "string" + }, + "datadictionary": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "dataquality": { + "title": "Data Quality", + "description": "Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", + "type": "string" + }, + "distribution": { + "title": "Distribution", + "description": "Holds multiple download URLs for datasets composed of multiple files and/or file types", + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": [ + "accessurl", + "format" + ], + "properties": { + "accessurl": { + "title": "Download URL", + "description": "URL providing direct access to the downloadable distribution of a dataset.", + "type": "string", + "format": "uri" + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + } + } + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "format": { + "title": "Format", + "description": "The file format or API type of the distribution.", + "anyOf": [ + { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + }, + { + "type": "null" + } + ] + }, + "identifier": { + "title": "Unique Identifier", + "description": "A unique identifier for the dataset or API as maintained within an Agency catalog or database.", + "type": "string", + "pattern": "[\\w]+" + }, + "issued": { + "title": "Release Date", + "description": "Date of formal issuance.", + "anyOf": [ + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "keyword": { + "title": "Tags", + "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + }, + "landingpage": { + "title": "Homepage URL", + "description": "Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + }, + "language": { + "title": "Language", + "description": "The language of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": 
"^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" + } + }, + { + "type": "null" + } + ] + }, + "license": { + "title": "License", + "description": "The license dataset or API is published with. See Open Licenses for more information.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "mbox": { + "title": "Contact Email", + "description": "Contact person’s email address.", + "type": "string", + "format": "email" + }, + "modified": { + "title": "Last Update", + "description": "Most recent date on which the dataset was changed, updated or modified.", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + } + ] + }, + "primaryitinvestmentuii": { + "title": "Primary IT Investment UII", + "description": "For linking a dataset with an IT Unique Investment Identifier (UII)", + "anyOf": [ + { + "type": "string", + "pattern": "[0-9]{3}-[0-9]{9}" + }, + { + "type": "null" + } + ] + }, + "programcode": { + "title": "Program Code", + "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{3}" + }, + "minItems": 1, + "uniqueItems": true + }, + "publisher": { + "title": "Publisher", + "description": "The publishing entity.", + "type": "string" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "systemofrecords": { + "title": "System of Records", + "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "temporal": { + "title": "Temporal", + "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", + "anyOf": [ + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "string", + "pattern": "^P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" + }, + { + "type": "string", + "pattern": "^R\\d*\\/([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?\\/P(?=\\w*\\d)(?:\\d+Y|Y)?(?:\\d+M|M)?(?:\\d+W|W)?(?:\\d+D|D)?(?:T(?:\\d+H|H)?(?:\\d+M|M)?(?:\\d+(?:\\­.\\d{1,2})?S|S)?)?$" + }, + { + "type": "null" + } + ] + }, + "theme": { + "title": "Category", + "description": "Main thematic category of the dataset.", + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", + "type": "string" + }, + "webservice": { + "title": "Endpoint", + "description": "Endpoint of web service to access dataset.", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + } + ] + } + } +} diff --git a/ckanext/datajson/schema/1_0_final/single_entry.json b/ckanext/datajson/schema/1_0_final/single_entry.json deleted file mode 100644 index 4567f43c..00000000 --- a/ckanext/datajson/schema/1_0_final/single_entry.json +++ /dev/null @@ -1,207 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "id": "http://project-open-data.github.io/schema/1_0_final/single_entry.json#", - "title": "Common Core Metadata Schema", - "description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).", - "type": "object", - "required": ["title", "description", "keyword", "modified", "publisher", "contactPoint", "mbox", "identifier", "accessLevel"], - "properties": { - "accessLevel": { - "description":"The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. 
Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", - "title": "Public Access Level", - "enum": ["public", "restricted public", "non-public"] - }, - "accessLevelComment": { - "title":"Access Level Comment", - "description":"An explanation for the selected \"accessLevel\" including instructions for how to access a restricted file, if applicable, or explanation for why a \"non-public\" or \"restricted public\" data assetis not \"public,\" if applicable. Text, 255 characters.", - "type": "string", - "maxLength":255 - }, - "accrualPeriodicity": { - "title":"Frequency", - "description":"Frequency with which dataset is published.", - "enum": ["Annual", "Bimonthly", "Semiweekly", "Daily", "Biweekly", "Semiannual", "Biennial", "Triennial", - "Three times a week", "Three times a month", "Continuously updated", "Monthly", "Quarterly", "Semimonthly", - "Three times a year", "Weekly", "Completely irregular"] - }, - "bureauCode": { - "title":"Bureau Code", - "description":"Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{2}" - }, - "minItems": 1, - "uniqueItems": true - }, - "contactPoint": { - "title":"Contact Name", - "description":"Contact person’s name for the asset.", - "type": "string" - }, - "dataDictionary": { - "title":"Data Dictionary", - "description":"URL to the data dictionary for the dataset or API. Note that documentation other than a data dictionary can be referenced using Related Documents as shown in the expanded fields.", - "type": "string", - "format": "uri" - }, - "dataQuality": { - "title":"Data Quality", - "description":"Whether the dataset meets the agency’s Information Quality Guidelines (true/false).", - "type": "boolean" - }, - "description": { - "title" : "Description", - "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", - "type": "string" - }, - "distribution": { - "title":"Distribution", - "description":"Holds multiple download URLs for datasets composed of multiple files and/or file types", - "type": "array", - "items": { - "type": "object", - "properties": { - "accessURL": { - "title":"Download URL", - "description":"URL providing direct access to the downloadable distribution of a dataset.", - "type": "string", - "format": "uri" - }, - "format": { - "title":"Format", - "description":"The file format or API type of the distribution.", - "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", - "type": "string" - } - } - }, - "minItems": 1, - "uniqueItems": true - }, - "identifier": { - "title":"Unique Identifier", - "description":"A unique identifier for the dataset or API as maintained within an Agency catalog or database.", - "type": "string", - "pattern": "[\\w]+" - }, - "issued": { - "title":"Release Date", - "description":"Date of formal issuance.", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "keyword": { - "title": "Tags", - 
"description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "uniqueItems": true - }, - "landingPage": { - "title":"Homepage URL", - "description":"Alternative landing page used to redirect user to a contextual, Agency-hosted “homepage” for the Dataset or API when selecting this resource from the Data.gov user interface.", - "type": "string", - "format": "uri" - }, - "language": { - "title":"Language", - "description":"The language of the dataset.", - "type": "array", - "items": { - "type": "string", - "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" - } - }, - "license": { - "title":"License", - "description":"The license dataset or API is published with. See Open Licenses for more information.", - "type": "string" - }, - "mbox": { - "title":"Contact Email", - "description":"Contact person’s email address.", - "type": "string", - "format": "email" - }, - "modified": { - "title": "Last Update", - "description": "Most recent date on which the dataset was changed, updated or modified.", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "PrimaryITInvestmentUII": { - "title":"Primary IT Investment UII", - "description":"For linking a dataset with an IT Unique Investment Identifier (UII)", - "type": "string" - }, - "programCode": { - "title":"Program Code", - "description":"Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{3}" - }, - "minItems": 1, - "uniqueItems": true - }, - "publisher": { - "title":"Publisher", - "description": "The publishing entity.", - "type": "string" - }, - "references": { - "title":"Related Documents", - "description":"Related documents such as technical information about a dataset, developer documentation, etc.", - "type": "array", - "items": { - "type": "string", - "format": "uri" - }, - "minItems": 1, - "uniqueItems": true - }, - "spatial": { - "title":"Spatial", - "description":"The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", - "type": "string" - }, - "systemOfRecords": { - "title":"System of Records", - "description":"If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", - "type": "string" - }, - "temporal": { - "title":"Temporal", - "description":"The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", - "type": "string", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$" - }, - "theme": { - "title":"Category", - "description":"Main thematic category of the dataset.", - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "uniqueItems": true - }, - "title": { - "title": "Title", - "description": "Human-readable name of the asset. Should be in plain English and include sufficient detail to facilitate search and discovery.", - "type": "string" - }, - "webService": { - "title":"Endpoint", - "description":"Endpoint of web service to access dataset.", - "type": "string", - "format": "uri" - } - } -} diff --git a/ckanext/datajson/templates/html_rendition.html b/ckanext/datajson/templates/html_rendition.html new file mode 100644 index 00000000..96ff1ee7 --- /dev/null +++ b/ckanext/datajson/templates/html_rendition.html @@ -0,0 +1,43 @@ +{% extends "page.html" %} + +{% block subtitle %}Data Catalog (HTML Table Rendition){% endblock %} + +{% block breadcrumb_content %} +{% endblock %} + +{% block primary %} +
+<div class="module">
+  <div class="module-content">
+    <h1>Data Catalog</h1>
+
+    <p>Welcome to the {{g.site_title}}. There are several ways you may view &amp; download the data catalog:</p>
+
+    <table class="table table-striped">
+      <tbody>
+        {% for item in c.catalog_data %}
+        <tr>
+          <td>
+            <p><strong>{{item.title}}</strong></p>
+            <p>{{item.description}}</p>
+            <p>
+              {% if item.accessURL %}<a href="{{item.accessURL}}">{{item.accessURL}}</a><br/>
+              {% endif %}
+              Last Modified: {% if item.modified %}{{item.modified}}{% else %}unknown{% endif %}
+            </p>
+          </td>
+        </tr>
+        {% endfor %}
+      </tbody>
+    </table>
+  </div>
+</div>
+{% endblock %} + +{% block secondary %}{% endblock %} diff --git a/ckanext/datajson/templates/organization/read.html b/ckanext/datajson/templates/organization/read.html index fbcaca29..1c5d73ba 100644 --- a/ckanext/datajson/templates/organization/read.html +++ b/ckanext/datajson/templates/organization/read.html @@ -5,10 +5,12 @@ {% link_for _('Add Dataset'), controller='package', action='new', group=c.group_dict.id, class_='btn btn-primary', icon='plus-sign-alt' %} + {% endif %}
+
{% endblock %} diff --git a/requirements.txt b/pip-requirements.txt similarity index 76% rename from requirements.txt rename to pip-requirements.txt index 441b63d5..4f5e07df 100644 --- a/requirements.txt +++ b/pip-requirements.txt @@ -1,3 +1,4 @@ pyyaml lepl jsonschema +rfc3987 \ No newline at end of file diff --git a/setup.py b/setup.py index 4431576f..09c5f542 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ entry_points=\ """ [ckan.plugins] - datajson=ckanext.datajson:DataJsonPlugin + datajson=ckanext.datajson:DataJsonPlugin + datajson_export=ckanext.datajson:JsonExportPlugin datajson_harvest=ckanext.datajson:DataJsonHarvester cmsdatanav_harvest=ckanext.datajson:CmsDataNavigatorHarvester """,
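
As a usage sketch (not taken from this changeset): the schema files above can be exercised directly with jsonschema, and the rfc3987 package added to pip-requirements.txt is the optional dependency that lets jsonschema's FormatChecker verify "uri"-formatted strings such as references and webservice. The schema path, field names, and values below are illustrative assumptions only.

# Illustrative only: validate one catalog entry against a bundled schema file.
import json

from jsonschema import Draft4Validator, FormatChecker

# Placeholder path; point at whichever bundled schema version applies.
with open("path/to/single_entry_schema.json") as f:
    schema = json.load(f)

# A made-up entry that exercises a few of the patterned fields shown above.
entry = {
    "title": "Example dataset",
    "description": "A fabricated entry used only to exercise the schema.",
    "keyword": ["example"],
    "modified": "2014-01-07",
    "publisher": "Example Agency",
    "contactPoint": "Jane Doe",
    "mbox": "jane.doe@agency.gov",
    "identifier": "example-0001",
    "accessLevel": "public",
    "temporal": "2000-01-15T00:45:00Z/2010-01-15T00:06:00Z",
    "references": ["https://example.gov/docs"],
}

# FormatChecker only enforces "uri" when rfc3987 is installed; without it,
# jsonschema silently skips that format check.
validator = Draft4Validator(schema, format_checker=FormatChecker())
for error in validator.iter_errors(entry):
    path = "/".join(str(p) for p in error.path) or "<entry>"
    print(path, "->", error.message)

With a conforming entry the loop prints nothing; otherwise each offending field and the constraint it failed are listed.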