Skip to content

Commit

Permalink
strip cookie warnings, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
jimregan authored and brawer committed Nov 1, 2017
1 parent c53b003 commit 4ea2d51
Showing 1 changed file with 19 additions and 9 deletions.
28 changes: 19 additions & 9 deletions Lib/corpuscrawler/crawl_ga.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import re
import sys

from corpuscrawler.util import crawl_udhr, urlpath
from corpuscrawler.util import crawl_udhr, urlpath, striptags, cleantext

try:
import xml.etree.cElementTree as etree
Expand All @@ -39,7 +39,7 @@ def _rtenuacht_path(url):
return rtenuacht or rnagnuacht


def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True):
def _fetch_rte_sitemap(crawler, url, processed=set()):
"""'http://example.org/sitemap.xml' --> {url: lastmod}"""
result = {}
doc = crawler.fetch(url)
Expand Down Expand Up @@ -72,8 +72,6 @@ def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True)
if location is None:
continue
location = location.text.strip()
if not url_filter(location):
continue
lastmod = urlinfo.find(lastmodpath)
if lastmod is not None:
try:
Expand All @@ -85,13 +83,21 @@ def _fetch_rte_sitemap(crawler, url, processed=set(), url_filter=lambda x: True)
result[location] = lastmod
return result

def _rte_writable_paragraph(text):
if text == '':
return False
if text.startswith('© RTÉ '):
return False
if text.startswith('By using this website, you consent'):
return False
return True

def crawl_nuachtrte(crawler, out):
sitemap = _fetch_rte_sitemap(crawler,
'http://www.rte.ie/sitemap.xml',
url_filter=lambda s: _rtenuacht_path(s)
)
sitemap = _fetch_rte_sitemap(crawler, 'http://www.rte.ie/sitemap.xml')
pubdate_regex = re.compile(r'name="DC.date" (?:scheme="DCTERMS.URI" )?content="([0-9T:+\-]{19,25})"')
for url in sorted(sitemap.keys()):
if not _rtenuacht_path(url):
continue
fetchresult = crawler.fetch(url)
if fetchresult.status != 200:
continue
Expand All @@ -107,7 +113,11 @@ def crawl_nuachtrte(crawler, out):
if title: title = striptags(title.group(1).split('- RTÉ')[0]).strip()
if title: out.write(cleantext(title) + '\n')
for paragraph in re.findall(r'<p>(.+?)</p>', html):
out.write(cleantext(paragraph) + '\n')
cleaned = cleantext(paragraph)
if _rte_writable_paragraph(cleaned):
out.write(cleaned + '\n')
else:
continue



0 comments on commit 4ea2d51

Please sign in to comment.