Crawling the web with Python.


#1

I am stuck on the "Beginning to Extract Data" lesson of the "Crawl the Web with Python" course.
Here is the code:

from lxml import html
import requests

class AppCrawler:
	"""Fetch a page starting from a single URL and print its text.

	Attributes set by the constructor:
	starting_url -- URL handed to get_app_from_link() by crawl().
	depth        -- stored as given; no method in this class reads it yet.
	apps         -- list that starts empty; no method in this class appends to it yet.
	"""

	def __init__(self, starting_url, depth):
		self.starting_url = starting_url
		self.depth = depth
		self.apps = []

	def crawl(self):
		"""Fetch and print the page at self.starting_url. Returns None."""
		self.get_app_from_link(self.starting_url)

	def get_app_from_link(self, link):
		"""Download `link` with requests.get() and print the response text.

		The text is passed through _console_safe() first: printing it
		directly raises UnicodeEncodeError when the page contains a
		character (e.g. U+2019) that the console encoding cannot represent.
		Returns None.
		"""
		start_page = requests.get(link)
		print(self._console_safe(start_page.text))

	@staticmethod
	def _console_safe(text, encoding=None):
		"""Return `text` with characters `encoding` cannot represent replaced by '?'.

		text     -- unicode text to sanitise.
		encoding -- target codec name; when None, the encoding of sys.stdout
		            is used, falling back to the interpreter default when
		            stdout does not report one (e.g. output is piped).
		"""
		# Local import so this class does not depend on a file-level import.
		import sys

		if encoding is None:
			encoding = getattr(sys.stdout, 'encoding', None) or sys.getdefaultencoding()
		# Round-trip through the target codec: 'replace' swaps unencodable
		# characters for '?', and decoding hands back text that print can
		# encode with the same codec without raising.
		return text.encode(encoding, 'replace').decode(encoding)

class App:
	"""Plain record describing one app.

	name, developer and price are concatenated into the string form, so
	they are presumably unicode text (TODO confirm against the code that
	will build App objects). links is stored as given and not used by
	__str__.
	"""

	def __init__(self, name, developer, price, links):
		self.name = name
		self.developer = developer
		self.price = price
		self.links = links

	def __str__(self):
		"""Return the name, developer and price, one per CRLF-terminated line.

		The original definition omitted `self`, so str(app) / print app
		raised TypeError before producing any output.
		"""
		text = (u"Name: " + self.name +
			u"\r\nDeveloper: " + self.developer +
			u"\r\nPrice: " + self.price + u"\r\n")
		if str is bytes:
			# Python 2: __str__ must return a byte string, so encode as UTF-8
			# exactly as the original per-field .encode('UTF-8') calls did.
			return text.encode('UTF-8')
		return text

# Crawl one App Store page, then print every entry found in crawler.apps.
crawler = AppCrawler(
	'http://itunes.apple.com/us/app/candy-crush-saga/id553834731',
	0,
)
crawler.crawl()

for app in crawler.apps:
	print(app)

And the console log…


C:\Users\HP\Documents\Web Crawling>python spider.py
C:\Python27\lib\site-packages\requests\packages\urllib3\util\ssl_.py:315: SNIMissingWarning: An HTTPS request has been made, but the SNI (Subject Name Indication) extension to TLS is not available on this platform. This may cause the server to present an incorrect TLS certificate, which can cause validation failures. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#snimissingwarning.
SNIMissingWarning
C:\Python27\lib\site-packages\requests\packages\urllib3\util\ssl_.py:120: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
InsecurePlatformWarning
Traceback (most recent call last):
  File "spider.py", line 32, in <module>
    crawler.crawl()
  File "spider.py", line 11, in crawl
    self.get_app_from_link(self.starting_url)
  File "spider.py", line 16, in get_app_from_link
    print start_page.text
  File "C:\Python27\lib\encodings\cp437.py", line 12, in encode
    return codecs.charmap_encode(input,errors,encoding_map)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u2019' in position 43280: character maps to <undefined>

Please help…

Anyhow, I resolved it by searching for SNIMissingWarning and then installing something via pip.