Skip to content

Commit

Permalink
some of the files in the sitemap do not exist
Browse files (browse the repository at this point in the history)
  • Loading branch information
jimregan authored and brawer committed Nov 1, 2017
1 parent bedad9d commit c53b003
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions Lib/corpuscrawler/crawl_ga.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -43,7 +43,8 @@ def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True)
"""'http://example.org/sitemap.xml' --> {url: lastmod}"""
result = {}
doc = crawler.fetch(url)
assert doc.status == 200, (doc.status, url)
if doc.status != 200:
return None
content = doc.content
if content.startswith(b'\x1F\x8B'):
content = zlib.decompress(content, zlib.MAX_WBITS|32)
Expand All @@ -60,7 +61,11 @@ def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True)
if subsitemap in processed:
continue
processed.add(subsitemap)
result.update(crawler.fetch_sitemap(subsitemap, processed))
nextiter = _fetch_rte_sitemap(crawler, subsitemap, processed)
if nextiter is not None:
result.update(nextiter)
else:
break
locpath, lastmodpath = 'loc', 'lastmod'
for urlinfo in sitemap.findall('url') + sitemap.findall('{%s}url' % xmlns):
location = urlinfo.find(locpath)
Expand All @@ -71,8 +76,11 @@ def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True)
continue
lastmod = urlinfo.find(lastmodpath)
if lastmod is not None:
lastmod = lastmod.text.strip()
if len(lastmod) == 0:
try:
lastmod = lastmod.text.strip()
if len(lastmod) == 0:
lastmod = None
except AttributeError:
lastmod = None
result[location] = lastmod
return result
Expand Down

0 comments on commit c53b003

Please sign in to comment.