Skip to content

Commit

Permalink
handle mixed broken/unbroken namespaces
Browse files (browse the repository at this point in the history)
  • Loading branch information
jimregan authored and brawer committed Nov 1, 2017
1 parent 63554b8 commit 039de18
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion Lib/corpuscrawler/util.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -155,7 +155,9 @@ def fetch_sitemap(self, url, processed=set(), subsitemap_filter=lambda x: True):
except etree.ParseError:
return {}
xmlns = 'http://www.sitemaps.org/schemas/sitemap/0.9' # XML namespace
for s in sitemap.findall('{%s}sitemap/{%s}loc' % (xmlns, xmlns)):
submap1 = sitemap.findall('{%s}sitemap/{%s}loc' % (xmlns, xmlns))
submap2 = sitemap.findall('sitemap/loc')
for s in submap1 + submap2:
subsitemap = s.text.strip()
# prevent infinite recursion
if subsitemap in processed:
Expand Down

0 comments on commit 039de18

Please sign in to comment.