Skip to content

Commit

Permalink
make (ssl) context a property, add setter
Browse files Browse the repository at this point in the history
  • Loading branch information
jimregan committed Nov 16, 2019
1 parent 3e0857a commit bc012db
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions Lib/corpuscrawler/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def __init__(self, language, output_dir, cache_dir):
self.useragent = 'LinguisticCorpusCrawler/1.0'
self.useragent_for_robots_txt = self.useragent.split('/')[0]
self.crawldelay = 15.0 # seconds between fetches
self.context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
for path in (output_dir, cache_dir):
if not os.path.exists(path):
os.makedirs(path)
Expand Down Expand Up @@ -168,9 +169,8 @@ def fetch(self, url, redirections=None):
delay = random.uniform(self.crawldelay, self.crawldelay + 2) # jitter
time.sleep(delay)
request = Request(url, headers={'User-Agent': self.useragent})
context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
try:
response = urlopen(request, context=context)
response = urlopen(request, context=self.context)
except HTTPError as err:
response = err
except UnicodeDecodeError as err:
Expand Down Expand Up @@ -508,6 +508,10 @@ def crawl_voice_of_america(self, out, host, ignore_ascii=False):
out.write('\n'.join(paragraphs) + '\n')


def set_context(self, context):
    """Replace the SSL context stored on this crawler.

    ``fetch`` passes ``self.context`` to ``urlopen``, so the object
    installed here is used for requests made after this call.

    Args:
        context: The context object to store on ``self.context``;
            the constructor's default is an ``ssl.SSLContext``.
    """
    self.context = context


# Normally we put site-specific logic into the language-specific scripts,
# but a couple sites have large amounts of text in many underserved languages,
# using the same site structure for all languages.
Expand Down

0 comments on commit bc012db

Please sign in to comment.