Skip to content

Commit

Permalink
Merge pull request #55 from jimregan/regex-fix
Browse files (browse the repository at this point in the history)
[ga] fix regex
  • Loading branch information
sffc committed Nov 6, 2019
2 parents ea46c00 + ba8c432 commit 4c6cb38
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion Lib/corpuscrawler/crawl_ga.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,10 +58,12 @@ def _rte_writable_paragraph(text):
return False
if text.find('is not responsible for the content') >= 0:
return False
if text.find('RTÉ uses cookies in accordance with our Cookie Policy') >= 0:
return False
return True

def _check_rte_sitemap(url):
urlmatch = re.search(r'http://www.rte.ie/sitemap-([0-9]+)0000.xml', url)
urlmatch = re.search(r'https?://www.rte.ie/sitemap-([0-9]+)0000.xml', url)
try:
if int(urlmatch.group(1)) < 40:
return True
Expand Down

0 comments on commit 4c6cb38

Please sign in to comment.