
Commit

fix crawling
sixtedemaupeou committed Aug 29, 2022
1 parent 870813f commit 09d1db2
Showing 2 changed files with 17 additions and 18 deletions.
1 change: 1 addition & 0 deletions udata_hydra/cli.py
@@ -59,6 +59,7 @@ async def init_db(drop=False, table=None, index=False, reindex=False):
         """
         CREATE TABLE IF NOT EXISTS checks(
             id serial PRIMARY KEY,
+            resource_id UUID,
             url VARCHAR,
             domain VARCHAR,
             created_at TIMESTAMP DEFAULT NOW(),
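Side note, not part of the commit: the new resource_id column is typed UUID, while crawl.py (below) normalises the value to str before building its query. A minimal sketch of writing a check row against this schema with asyncpg, the driver behind the project's async pool; the connection string and the column subset are assumptions made purely for illustration:

    import asyncio
    import uuid

    import asyncpg  # driver assumed, matching the async pool usage in crawl.py


    async def demo_insert_check():
        # Hypothetical DSN; the real project reads its database settings from config.
        conn = await asyncpg.connect("postgresql://localhost/udata_hydra")
        resource_id = uuid.uuid4()
        # asyncpg accepts either a uuid.UUID or its string form for a UUID column,
        # so passing str(resource_id) mirrors the cast added in crawl.py.
        await conn.execute(
            "INSERT INTO checks (resource_id, url, domain) VALUES ($1, $2, $3)",
            str(resource_id),
            "https://example.com/data.csv",
            "example.com",
        )
        await conn.close()


    asyncio.run(demo_insert_check())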
34 changes: 16 additions & 18 deletions udata_hydra/crawl.py
@@ -49,6 +49,8 @@ async def insert_check(data: dict):
 async def update_check_and_catalog(check_data: dict) -> None:
     """Update the catalog and checks tables"""
     context.monitor().set_status("Updating checks and catalog...")
+    check_data['resource_id'] = str(check_data['resource_id'])
+
     pool = await context.pool()
     async with pool.acquire() as connection:
         q = f"""
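One plausible reason for the str() cast above (a reader's aside, not stated in the commit): asyncpg returns UUID columns as uuid.UUID objects, which never compare equal to their string form and are not JSON-serialisable, so normalising early avoids surprises downstream. A self-contained sketch:

    import json
    import uuid

    rid = uuid.uuid4()                    # what asyncpg yields for a UUID column
    assert rid != str(rid)                # a UUID never equals its string form
    try:
        json.dumps({"resource_id": rid})  # TypeError: UUID is not JSON serialisable
    except TypeError:
        pass
    json.dumps({"resource_id": str(rid)})  # fine once normalised to str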
@@ -231,16 +233,22 @@ async def check_url(row, session, sleep=0, method="get"):
                 }
             )
             return STATUS_OK
+    except asyncio.exceptions.TimeoutError:
+        await update_check_and_catalog(
+            {
+                "resource_id": row["resource_id"],
+                "url": row["url"],
+                "domain": domain,
+                "timeout": True,
+            }
+        )
+        return STATUS_TIMEOUT
     # TODO: debug AssertionError, should be caught in DB now
     # File "[...]aiohttp/connector.py", line 991, in _create_direct_connection
     # assert port is not None
     # UnicodeError: encoding with 'idna' codec failed (UnicodeError: label too long)
     # eg http://%20Localisation%20des%20acc%C3%A8s%20des%20offices%20de%20tourisme
-    except (
-        aiohttp.client_exceptions.ClientError,
-        AssertionError,
-        UnicodeError,
-    ) as e:
+    except Exception as e:
         error = getattr(e, "message", None) or str(e)
         await update_check_and_catalog(
             {
@@ -255,16 +263,6 @@ async def check_url(row, session, sleep=0, method="get"):
         )
         log.error(f"{row['url']}, {e}")
         return STATUS_ERROR
-    except asyncio.exceptions.TimeoutError:
-        await update_check_and_catalog(
-            {
-                "resource_id": row["resource_id"],
-                "url": row["url"],
-                "domain": domain,
-                "timeout": True,
-            }
-        )
-        return STATUS_TIMEOUT
 
 
 async def crawl_urls(to_parse):
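The two hunks above move the TimeoutError handler ahead of the new catch-all. Ordering matters here: asyncio.exceptions.TimeoutError is itself a subclass of Exception, so a bare `except Exception` listed first would swallow timeouts and record them as generic errors. A standalone illustration, not code from the repository:

    import asyncio


    async def classify():
        try:
            await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)
        except asyncio.exceptions.TimeoutError:
            return "timeout"        # reached only because this clause comes first
        except Exception as exc:
            return f"error: {exc}"


    print(asyncio.run(classify()))  # -> timeout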
@@ -294,7 +292,7 @@ async def crawl_batch():
     # first urls that are prioritised
     q = f"""
         SELECT * FROM (
-            SELECT DISTINCT(catalog.url), dataset_id, resource_id
+            SELECT catalog.url, dataset_id, resource_id
             FROM catalog
             WHERE {excluded}
             AND deleted = False
@@ -307,7 +305,7 @@ async def crawl_batch():
     if len(to_check) < config.BATCH_SIZE:
         q = f"""
             SELECT * FROM (
-                SELECT DISTINCT(catalog.url), dataset_id, resource_id
+                SELECT catalog.url, dataset_id, resource_id
                 FROM catalog
                 WHERE catalog.last_check IS NULL
                 AND {excluded}
@@ -324,7 +322,7 @@ async def crawl_batch():
         limit = config.BATCH_SIZE - len(to_check)
         q = f"""
             SELECT * FROM (
-                SELECT DISTINCT(catalog.url), dataset_id, resource_id
+                SELECT catalog.url, dataset_id, catalog.resource_id
                 FROM catalog, checks
                 WHERE catalog.last_check IS NOT NULL
                 AND {excluded}
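Two details in the three query hunks above are worth spelling out (reader's notes, not text from the commit). First, the qualification to catalog.resource_id in the last query is presumably needed because the checks table now has a resource_id column of its own, which would make the bare column name ambiguous in the catalog/checks join. Second, in PostgreSQL DISTINCT(catalog.url) is not a per-column operation; the parentheses are plain grouping and DISTINCT still applies to the whole select list, so the old queries only collapsed fully identical (url, dataset_id, resource_id) rows. Deduplicating by URL alone would instead need DISTINCT ON, sketched below purely for contrast; the commit simply drops the whole-row deduplication.

    # Contrast only, a hypothetical query: one row per URL via DISTINCT ON.
    dedupe_by_url = """
        SELECT DISTINCT ON (catalog.url) catalog.url, dataset_id, resource_id
        FROM catalog
        WHERE deleted = False
        ORDER BY catalog.url
    """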
