# encoding: utf-8
#
# Copyright (c) 2014 Dean Jackson <[email protected]>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2014-02-15
#

"""Lightweight HTTP library with a requests-like interface."""
  10. import codecs
  11. import json
  12. import mimetypes
  13. import os
  14. import random
  15. import re
  16. import socket
  17. import string
  18. import unicodedata
  19. import urllib
  20. import urllib2
  21. import urlparse
  22. import zlib
  23. USER_AGENT = u'Alfred-Workflow/1.19 (+http://www.deanishe.net/alfred-workflow)'
  24. # Valid characters for multipart form data boundaries
  25. BOUNDARY_CHARS = string.digits + string.ascii_letters
  26. # HTTP response codes
  27. RESPONSES = {
  28. 100: 'Continue',
  29. 101: 'Switching Protocols',
  30. 200: 'OK',
  31. 201: 'Created',
  32. 202: 'Accepted',
  33. 203: 'Non-Authoritative Information',
  34. 204: 'No Content',
  35. 205: 'Reset Content',
  36. 206: 'Partial Content',
  37. 300: 'Multiple Choices',
  38. 301: 'Moved Permanently',
  39. 302: 'Found',
  40. 303: 'See Other',
  41. 304: 'Not Modified',
  42. 305: 'Use Proxy',
  43. 307: 'Temporary Redirect',
  44. 400: 'Bad Request',
  45. 401: 'Unauthorized',
  46. 402: 'Payment Required',
  47. 403: 'Forbidden',
  48. 404: 'Not Found',
  49. 405: 'Method Not Allowed',
  50. 406: 'Not Acceptable',
  51. 407: 'Proxy Authentication Required',
  52. 408: 'Request Timeout',
  53. 409: 'Conflict',
  54. 410: 'Gone',
  55. 411: 'Length Required',
  56. 412: 'Precondition Failed',
  57. 413: 'Request Entity Too Large',
  58. 414: 'Request-URI Too Long',
  59. 415: 'Unsupported Media Type',
  60. 416: 'Requested Range Not Satisfiable',
  61. 417: 'Expectation Failed',
  62. 500: 'Internal Server Error',
  63. 501: 'Not Implemented',
  64. 502: 'Bad Gateway',
  65. 503: 'Service Unavailable',
  66. 504: 'Gateway Timeout',
  67. 505: 'HTTP Version Not Supported'
  68. }
  69. def str_dict(dic):
  70. """Convert keys and values in ``dic`` into UTF-8-encoded :class:`str`.
  71. :param dic: :class:`dict` of Unicode strings
  72. :returns: :class:`dict`
  73. """
  74. if isinstance(dic, CaseInsensitiveDictionary):
  75. dic2 = CaseInsensitiveDictionary()
  76. else:
  77. dic2 = {}
  78. for k, v in dic.items():
  79. if isinstance(k, unicode):
  80. k = k.encode('utf-8')
  81. if isinstance(v, unicode):
  82. v = v.encode('utf-8')
  83. dic2[k] = v
  84. return dic2
  85. class NoRedirectHandler(urllib2.HTTPRedirectHandler):
  86. """Prevent redirections."""
  87. def redirect_request(self, *args):
  88. return None
  89. # Adapted from https://gist.github.com/babakness/3901174
  90. class CaseInsensitiveDictionary(dict):
  91. """Dictionary with caseless key search.
  92. Enables case insensitive searching while preserving case sensitivity
  93. when keys are listed, ie, via keys() or items() methods.
  94. Works by storing a lowercase version of the key as the new key and
  95. stores the original key-value pair as the key's value
  96. (values become dictionaries).
  97. """
  98. def __init__(self, initval=None):
  99. """Create new case-insensitive dictionary."""
  100. if isinstance(initval, dict):
  101. for key, value in initval.iteritems():
  102. self.__setitem__(key, value)
  103. elif isinstance(initval, list):
  104. for (key, value) in initval:
  105. self.__setitem__(key, value)
  106. def __contains__(self, key):
  107. return dict.__contains__(self, key.lower())
  108. def __getitem__(self, key):
  109. return dict.__getitem__(self, key.lower())['val']
  110. def __setitem__(self, key, value):
  111. return dict.__setitem__(self, key.lower(), {'key': key, 'val': value})
  112. def get(self, key, default=None):
  113. try:
  114. v = dict.__getitem__(self, key.lower())
  115. except KeyError:
  116. return default
  117. else:
  118. return v['val']
  119. def update(self, other):
  120. for k, v in other.items():
  121. self[k] = v
  122. def items(self):
  123. return [(v['key'], v['val']) for v in dict.itervalues(self)]
  124. def keys(self):
  125. return [v['key'] for v in dict.itervalues(self)]
  126. def values(self):
  127. return [v['val'] for v in dict.itervalues(self)]
  128. def iteritems(self):
  129. for v in dict.itervalues(self):
  130. yield v['key'], v['val']
  131. def iterkeys(self):
  132. for v in dict.itervalues(self):
  133. yield v['key']
  134. def itervalues(self):
  135. for v in dict.itervalues(self):
  136. yield v['val']
  137. class Response(object):
  138. """
  139. Returned by :func:`request` / :func:`get` / :func:`post` functions.
  140. Simplified version of the ``Response`` object in the ``requests`` library.
  141. >>> r = request('http://www.google.com')
  142. >>> r.status_code
  143. 200
  144. >>> r.encoding
  145. ISO-8859-1
  146. >>> r.content # bytes
  147. <html> ...
  148. >>> r.text # unicode, decoded according to charset in HTTP header/meta tag
  149. u'<html> ...'
  150. >>> r.json() # content parsed as JSON
  151. """
  152. def __init__(self, request, stream=False):
  153. """Call `request` with :mod:`urllib2` and process results.
  154. :param request: :class:`urllib2.Request` instance
  155. :param stream: Whether to stream response or retrieve it all at once
  156. :type stream: ``bool``
  157. """
  158. self.request = request
  159. self._stream = stream
  160. self.url = None
  161. self.raw = None
  162. self._encoding = None
  163. self.error = None
  164. self.status_code = None
  165. self.reason = None
  166. self.headers = CaseInsensitiveDictionary()
  167. self._content = None
  168. self._content_loaded = False
  169. self._gzipped = False
  170. # Execute query
  171. try:
  172. self.raw = urllib2.urlopen(request)
  173. except urllib2.HTTPError as err:
  174. self.error = err
  175. try:
  176. self.url = err.geturl()
  177. # sometimes (e.g. when authentication fails)
  178. # urllib can't get a URL from an HTTPError
  179. # This behaviour changes across Python versions,
  180. # so no test cover (it isn't important).
  181. except AttributeError: # pragma: no cover
  182. pass
  183. self.status_code = err.code
  184. else:
  185. self.status_code = self.raw.getcode()
  186. self.url = self.raw.geturl()
  187. self.reason = RESPONSES.get(self.status_code)
  188. # Parse additional info if request succeeded
  189. if not self.error:
  190. headers = self.raw.info()
  191. self.transfer_encoding = headers.getencoding()
  192. self.mimetype = headers.gettype()
  193. for key in headers.keys():
  194. self.headers[key.lower()] = headers.get(key)
  195. # Is content gzipped?
  196. # Transfer-Encoding appears to not be used in the wild
  197. # (contrary to the HTTP standard), but no harm in testing
  198. # for it
  199. if ('gzip' in headers.get('content-encoding', '') or
  200. 'gzip' in headers.get('transfer-encoding', '')):
  201. self._gzipped = True
  202. @property
  203. def stream(self):
  204. """Whether response is streamed.
  205. Returns:
  206. bool: `True` if response is streamed.
  207. """
  208. return self._stream
  209. @stream.setter
  210. def stream(self, value):
  211. if self._content_loaded:
  212. raise RuntimeError("`content` has already been read from "
  213. "this Response.")
  214. self._stream = value
  215. def json(self):
  216. """Decode response contents as JSON.
  217. :returns: object decoded from JSON
  218. :rtype: :class:`list` / :class:`dict`
  219. """
  220. return json.loads(self.content, self.encoding or 'utf-8')
  221. @property
  222. def encoding(self):
  223. """Text encoding of document or ``None``.
  224. :returns: :class:`str` or ``None``
  225. """
  226. if not self._encoding:
  227. self._encoding = self._get_encoding()
  228. return self._encoding
  229. @property
  230. def content(self):
  231. """Raw content of response (i.e. bytes).
  232. :returns: Body of HTTP response
  233. :rtype: :class:`str`
  234. """
  235. if not self._content:
  236. # Decompress gzipped content
  237. if self._gzipped:
  238. decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)
  239. self._content = decoder.decompress(self.raw.read())
  240. else:
  241. self._content = self.raw.read()
  242. self._content_loaded = True
  243. return self._content
  244. @property
  245. def text(self):
  246. """Unicode-decoded content of response body.
  247. If no encoding can be determined from HTTP headers or the content
  248. itself, the encoded response body will be returned instead.
  249. :returns: Body of HTTP response
  250. :rtype: :class:`unicode` or :class:`str`
  251. """
  252. if self.encoding:
  253. return unicodedata.normalize('NFC', unicode(self.content,
  254. self.encoding))
  255. return self.content
  256. def iter_content(self, chunk_size=4096, decode_unicode=False):
  257. """Iterate over response data.
  258. .. versionadded:: 1.6
  259. :param chunk_size: Number of bytes to read into memory
  260. :type chunk_size: ``int``
  261. :param decode_unicode: Decode to Unicode using detected encoding
  262. :type decode_unicode: ``Boolean``
  263. :returns: iterator
  264. """
  265. if not self.stream:
  266. raise RuntimeError("You cannot call `iter_content` on a "
  267. "Response unless you passed `stream=True`"
  268. " to `get()`/`post()`/`request()`.")
  269. if self._content_loaded:
  270. raise RuntimeError(
  271. "`content` has already been read from this Response.")
  272. def decode_stream(iterator, r):
  273. decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace')
  274. for chunk in iterator:
  275. data = decoder.decode(chunk)
  276. if data:
  277. yield data
  278. data = decoder.decode(b'', final=True)
  279. if data: # pragma: no cover
  280. yield data
  281. def generate():
  282. if self._gzipped:
  283. decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)
  284. while True:
  285. chunk = self.raw.read(chunk_size)
  286. if not chunk:
  287. break
  288. if self._gzipped:
  289. chunk = decoder.decompress(chunk)
  290. yield chunk
  291. chunks = generate()
  292. if decode_unicode and self.encoding:
  293. chunks = decode_stream(chunks, self)
  294. return chunks
  295. def save_to_path(self, filepath):
  296. """Save retrieved data to file at ``filepath``.
  297. .. versionadded: 1.9.6
  298. :param filepath: Path to save retrieved data.
  299. """
  300. filepath = os.path.abspath(filepath)
  301. dirname = os.path.dirname(filepath)
  302. if not os.path.exists(dirname):
  303. os.makedirs(dirname)
  304. self.stream = True
  305. with open(filepath, 'wb') as fileobj:
  306. for data in self.iter_content():
  307. fileobj.write(data)
  308. def raise_for_status(self):
  309. """Raise stored error if one occurred.
  310. error will be instance of :class:`urllib2.HTTPError`
  311. """
  312. if self.error is not None:
  313. raise self.error
  314. return
  315. def _get_encoding(self):
  316. """Get encoding from HTTP headers or content.
  317. :returns: encoding or `None`
  318. :rtype: ``unicode`` or ``None``
  319. """
  320. headers = self.raw.info()
  321. encoding = None
  322. if headers.getparam('charset'):
  323. encoding = headers.getparam('charset')
  324. # HTTP Content-Type header
  325. for param in headers.getplist():
  326. if param.startswith('charset='):
  327. encoding = param[8:]
  328. break
  329. if not self.stream: # Try sniffing response content
  330. # Encoding declared in document should override HTTP headers
  331. if self.mimetype == 'text/html': # sniff HTML headers
  332. m = re.search("""<meta.+charset=["']{0,1}(.+?)["'].*>""",
  333. self.content)
  334. if m:
  335. encoding = m.group(1)
  336. elif ((self.mimetype.startswith('application/') or
  337. self.mimetype.startswith('text/')) and
  338. 'xml' in self.mimetype):
  339. m = re.search("""<?xml.+encoding=["'](.+?)["'][^>]*\?>""",
  340. self.content)
  341. if m:
  342. encoding = m.group(1)
  343. # Format defaults
  344. if self.mimetype == 'application/json' and not encoding:
  345. # The default encoding for JSON
  346. encoding = 'utf-8'
  347. elif self.mimetype == 'application/xml' and not encoding:
  348. # The default for 'application/xml'
  349. encoding = 'utf-8'
  350. if encoding:
  351. encoding = encoding.lower()
  352. return encoding
  353. def request(method, url, params=None, data=None, headers=None, cookies=None,
  354. files=None, auth=None, timeout=60, allow_redirects=False,
  355. stream=False):
  356. """Initiate an HTTP(S) request. Returns :class:`Response` object.
  357. :param method: 'GET' or 'POST'
  358. :type method: ``unicode``
  359. :param url: URL to open
  360. :type url: ``unicode``
  361. :param params: mapping of URL parameters
  362. :type params: :class:`dict`
  363. :param data: mapping of form data ``{'field_name': 'value'}`` or
  364. :class:`str`
  365. :type data: :class:`dict` or :class:`str`
  366. :param headers: HTTP headers
  367. :type headers: :class:`dict`
  368. :param cookies: cookies to send to server
  369. :type cookies: :class:`dict`
  370. :param files: files to upload (see below).
  371. :type files: :class:`dict`
  372. :param auth: username, password
  373. :type auth: ``tuple``
  374. :param timeout: connection timeout limit in seconds
  375. :type timeout: ``int``
  376. :param allow_redirects: follow redirections
  377. :type allow_redirects: ``Boolean``
  378. :param stream: Stream content instead of fetching it all at once.
  379. :type stream: ``bool``
  380. :returns: :class:`Response` object
  381. The ``files`` argument is a dictionary::
  382. {'fieldname' : { 'filename': 'blah.txt',
  383. 'content': '<binary data>',
  384. 'mimetype': 'text/plain'}
  385. }
  386. * ``fieldname`` is the name of the field in the HTML form.
  387. * ``mimetype`` is optional. If not provided, :mod:`mimetypes` will
  388. be used to guess the mimetype, or ``application/octet-stream``
  389. will be used.
  390. """
  391. # TODO: cookies
  392. socket.setdefaulttimeout(timeout)
  393. # Default handlers
  394. openers = []
  395. if not allow_redirects:
  396. openers.append(NoRedirectHandler())
  397. if auth is not None: # Add authorisation handler
  398. username, password = auth
  399. password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
  400. password_manager.add_password(None, url, username, password)
  401. auth_manager = urllib2.HTTPBasicAuthHandler(password_manager)
  402. openers.append(auth_manager)
  403. # Install our custom chain of openers
  404. opener = urllib2.build_opener(*openers)
  405. urllib2.install_opener(opener)
  406. if not headers:
  407. headers = CaseInsensitiveDictionary()
  408. else:
  409. headers = CaseInsensitiveDictionary(headers)
  410. if 'user-agent' not in headers:
  411. headers['user-agent'] = USER_AGENT
  412. # Accept gzip-encoded content
  413. encodings = [s.strip() for s in
  414. headers.get('accept-encoding', '').split(',')]
  415. if 'gzip' not in encodings:
  416. encodings.append('gzip')
  417. headers['accept-encoding'] = ', '.join(encodings)
  418. # Force POST by providing an empty data string
  419. if method == 'POST' and not data:
  420. data = ''
  421. if files:
  422. if not data:
  423. data = {}
  424. new_headers, data = encode_multipart_formdata(data, files)
  425. headers.update(new_headers)
  426. elif data and isinstance(data, dict):
  427. data = urllib.urlencode(str_dict(data))
  428. # Make sure everything is encoded text
  429. headers = str_dict(headers)
  430. if isinstance(url, unicode):
  431. url = url.encode('utf-8')
  432. if params: # GET args (POST args are handled in encode_multipart_formdata)
  433. scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
  434. if query: # Combine query string and `params`
  435. url_params = urlparse.parse_qs(query)
  436. # `params` take precedence over URL query string
  437. url_params.update(params)
  438. params = url_params
  439. query = urllib.urlencode(str_dict(params), doseq=True)
  440. url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
  441. req = urllib2.Request(url, data, headers)
  442. return Response(req, stream)
  443. def get(url, params=None, headers=None, cookies=None, auth=None,
  444. timeout=60, allow_redirects=True, stream=False):
  445. """Initiate a GET request. Arguments as for :func:`request`.
  446. :returns: :class:`Response` instance
  447. """
  448. return request('GET', url, params, headers=headers, cookies=cookies,
  449. auth=auth, timeout=timeout, allow_redirects=allow_redirects,
  450. stream=stream)
  451. def post(url, params=None, data=None, headers=None, cookies=None, files=None,
  452. auth=None, timeout=60, allow_redirects=False, stream=False):
  453. """Initiate a POST request. Arguments as for :func:`request`.
  454. :returns: :class:`Response` instance
  455. """
  456. return request('POST', url, params, data, headers, cookies, files, auth,
  457. timeout, allow_redirects, stream)
  458. def encode_multipart_formdata(fields, files):
  459. """Encode form data (``fields``) and ``files`` for POST request.
  460. :param fields: mapping of ``{name : value}`` pairs for normal form fields.
  461. :type fields: :class:`dict`
  462. :param files: dictionary of fieldnames/files elements for file data.
  463. See below for details.
  464. :type files: :class:`dict` of :class:`dicts`
  465. :returns: ``(headers, body)`` ``headers`` is a :class:`dict` of HTTP headers
  466. :rtype: 2-tuple ``(dict, str)``
  467. The ``files`` argument is a dictionary::
  468. {'fieldname' : { 'filename': 'blah.txt',
  469. 'content': '<binary data>',
  470. 'mimetype': 'text/plain'}
  471. }
  472. - ``fieldname`` is the name of the field in the HTML form.
  473. - ``mimetype`` is optional. If not provided, :mod:`mimetypes` will be used to guess the mimetype, or ``application/octet-stream`` will be used.
  474. """
  475. def get_content_type(filename):
  476. """Return or guess mimetype of ``filename``.
  477. :param filename: filename of file
  478. :type filename: unicode/string
  479. :returns: mime-type, e.g. ``text/html``
  480. :rtype: :class::class:`str`
  481. """
  482. return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
  483. boundary = '-----' + ''.join(random.choice(BOUNDARY_CHARS)
  484. for i in range(30))
  485. CRLF = '\r\n'
  486. output = []
  487. # Normal form fields
  488. for (name, value) in fields.items():
  489. if isinstance(name, unicode):
  490. name = name.encode('utf-8')
  491. if isinstance(value, unicode):
  492. value = value.encode('utf-8')
  493. output.append('--' + boundary)
  494. output.append('Content-Disposition: form-data; name="%s"' % name)
  495. output.append('')
  496. output.append(value)
  497. # Files to upload
  498. for name, d in files.items():
  499. filename = d[u'filename']
  500. content = d[u'content']
  501. if u'mimetype' in d:
  502. mimetype = d[u'mimetype']
  503. else:
  504. mimetype = get_content_type(filename)
  505. if isinstance(name, unicode):
  506. name = name.encode('utf-8')
  507. if isinstance(filename, unicode):
  508. filename = filename.encode('utf-8')
  509. if isinstance(mimetype, unicode):
  510. mimetype = mimetype.encode('utf-8')
  511. output.append('--' + boundary)
  512. output.append('Content-Disposition: form-data; '
  513. 'name="%s"; filename="%s"' % (name, filename))
  514. output.append('Content-Type: %s' % mimetype)
  515. output.append('')
  516. output.append(content)
  517. output.append('--' + boundary + '--')
  518. output.append('')
  519. body = CRLF.join(output)
  520. headers = {
  521. 'Content-Type': 'multipart/form-data; boundary=%s' % boundary,
  522. 'Content-Length': str(len(body)),
  523. }
  524. return (headers, body)