Use sansjson to sort package
FuhuXia committed Aug 30, 2023
1 parent 44b20e6 commit ada4689
Showing 2 changed files with 27 additions and 5 deletions.
31 changes: 26 additions & 5 deletions ckanext/datajson/datajson.py
@@ -17,6 +17,7 @@
 import json
 import yaml
 import os
+import sansjson

 from jsonschema.validators import Draft4Validator
 from jsonschema import FormatChecker
@@ -266,19 +267,26 @@ def gather_stage(self, harvest_job):
             # in the package so we can avoid updating datasets that
             # don't look like they've changed.
             source_hash = self.find_extra(pkg, "source_hash")
+
             if source_hash is None:
                 try:
                     source_hash = json.loads(self.find_extra(pkg, "extras_rollup")).get("source_hash")
                 except TypeError:
                     source_hash = None
+            # use sha1 for an existing hash created by older versions of make_upstream_content_hash,
+            # use sha256 for any new hash
+            # (sha1 digests are 40 hex characters, sha256 digests are 64)
+            sha1_or_sha256 = "sha1" if source_hash and len(source_hash) == 40 else "sha256"
+
             if pkg.get("state") == "active" \
                     and dataset['identifier'] not in existing_parents_demoted \
                     and dataset['identifier'] not in existing_datasets_promoted \
                     and source_hash == self.make_upstream_content_hash(dataset,
                                                                        source,
                                                                        catalog_extras,
-                                                                      schema_version):
-                log.info('SKIP: {}'.format(dataset['identifier']))
+                                                                      schema_version,
+                                                                      sha1_or_sha256):
+                log.info('{} match. SKIP: {}'.format(sha1_or_sha256, dataset['identifier']))
                 continue
             else:
                 pkg_id = uuid.uuid4().hex
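
Editor's note: the 40-vs-64 check works because hex digests have fixed lengths: SHA-1 always produces 40 hex characters and SHA-256 produces 64, so the stored hash itself reveals which algorithm created it. A minimal standalone sketch of the same heuristic (the helper name pick_hash_algo is illustrative, not part of the harvester):

import hashlib

def pick_hash_algo(stored_hash):
    # Hashes written by older harvester versions used SHA-1 (40 hex chars);
    # anything else is treated as the current SHA-256 (64 hex chars).
    if stored_hash and len(stored_hash) == 40:
        return "sha1"
    return "sha256"

assert len(hashlib.sha1(b"x").hexdigest()) == 40
assert len(hashlib.sha256(b"x").hexdigest()) == 64
assert pick_hash_algo(hashlib.sha1(b"x").hexdigest()) == "sha1"
assert pick_hash_algo(hashlib.sha256(b"x").hexdigest()) == "sha256"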
@@ -829,15 +837,28 @@ def import_stage(self, harvest_object):
         return True

     def make_upstream_content_hash(self, datasetdict, harvest_source,
-                                   catalog_extras, schema_version='1.0'):
+                                   catalog_extras, schema_version='1.0',
+                                   sha1_or_sha256='sha256'):
+        # sansjson.sort_pyobject sorts the dataset for better change detection,
+        # so we can avoid updating datasets with no meaningful changes (e.g. keyword order).
+
+        # By default sansjson sorting and sha256 are used; sha1 is kept for existing
+        # datasets until the dataset changes and its hash is rewritten as sha256.
+        if sha1_or_sha256 == 'sha1':
+            hash_function = hashlib.sha1
+        else:
+            hash_function = hashlib.sha256
+        datasetdict = sansjson.sort_pyobject(datasetdict)
+
         if schema_version == '1.0':
-            return hashlib.sha1(json.dumps(datasetdict, sort_keys=True) +  # NOQA W504
+            return hash_function(json.dumps(datasetdict, sort_keys=True) +  # NOQA W504
                                 "|" + harvest_source.config + "|" +  # NOQA W504
                                 self.HARVESTER_VERSION).hexdigest()
         else:
-            return hashlib.sha1((json.dumps(datasetdict, sort_keys=True) + "|" + json.dumps(catalog_extras,
+            return hash_function((json.dumps(datasetdict, sort_keys=True) + "|" + json.dumps(catalog_extras,
                                  sort_keys=True)).encode('utf-8')).hexdigest()


     def find_extra(self, pkg, key):
         for extra in pkg["extras"]:
             if extra["key"] == key:
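Editor's note: to see why the sort matters, the sketch below (my own illustration, assuming sansjson.sort_pyobject recursively normalizes both dict key order and list element order, as the commit's comment implies) hashes two logically identical datasets whose keyword lists differ only in order:

import hashlib
import json

import sansjson

a = {"title": "Example", "keyword": ["energy", "solar", "wind"]}
b = {"keyword": ["wind", "energy", "solar"], "title": "Example"}

def content_hash(dataset):
    # Normalize the structure first so equivalent datasets serialize to the
    # same JSON string, then hash it (assumes sansjson sorts lists as well
    # as dict keys).
    dataset = sansjson.sort_pyobject(dataset)
    return hashlib.sha256(
        json.dumps(dataset, sort_keys=True).encode('utf-8')).hexdigest()

# Without the sort, the reordered keyword list alone would change the
# digest and trigger a needless re-harvest; with it, the hashes match.
assert content_hash(a) == content_hash(b)

Because existing datasets still carry SHA-1 hashes, gather_stage compares with whichever algorithm matches the stored digest length; the first real change re-imports the dataset and writes a new SHA-256 hash, so records migrate gradually without a bulk re-harvest.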
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,4 +5,5 @@ pika>=1.1.0,<1.3.0
 enum34; python_version < '3.0' # Required by pika
 redis
 requests>=2.11.1
+sansjson==0.3.0
 six>=1.12.0
