diff options
Diffstat (limited to 'src/libwebvi/webvi/download.py')
-rw-r--r-- | src/libwebvi/webvi/download.py | 470 |
1 files changed, 470 insertions, 0 deletions
diff --git a/src/libwebvi/webvi/download.py b/src/libwebvi/webvi/download.py new file mode 100644 index 0000000..34240ff --- /dev/null +++ b/src/libwebvi/webvi/download.py @@ -0,0 +1,470 @@ +# download.py - webvi downloader backend +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import os +import asyncore +import asynchat +import cStringIO +import urllib +import subprocess +import socket +import signal +import pycurl +import asyncurl +import utils +import version + +WEBVID_USER_AGENT = 'libwebvi/%s %s' % (version.VERSION, pycurl.version) +MOZILLA_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5' + +try: + from libmimms import libmms +except ImportError, e: + pass + +# Mapping from curl error codes to webvi errors. The error constants +# are defined only in pycurl 7.16.1 and newer. +if pycurl.version_info()[2] >= 0x071001: + CURL_ERROR_CODE_MAPPING = \ + {pycurl.E_OK: 0, + pycurl.E_OPERATION_TIMEOUTED: 408, + pycurl.E_OUT_OF_MEMORY: 500, + pycurl.E_PARTIAL_FILE: 504, + pycurl.E_READ_ERROR: 504, + pycurl.E_RECV_ERROR: 504, + pycurl.E_REMOTE_FILE_NOT_FOUND: 404, + pycurl.E_TOO_MANY_REDIRECTS: 404, + pycurl.E_UNSUPPORTED_PROTOCOL: 500, + pycurl.E_URL_MALFORMAT: 400, + pycurl.E_COULDNT_CONNECT: 506, + pycurl.E_COULDNT_RESOLVE_HOST: 506, + pycurl.E_COULDNT_RESOLVE_PROXY: 506, + pycurl.E_FILE_COULDNT_READ_FILE: 404, + pycurl.E_GOT_NOTHING: 504, + pycurl.E_HTTP_RETURNED_ERROR: 404, + pycurl.E_INTERFACE_FAILED: 506, + pycurl.E_LOGIN_DENIED: 403} +else: + CURL_ERROR_CODE_MAPPING = {pycurl.E_OK: 0} + +class DownloaderException(Exception): + def __init__(self, errcode, errmsg): + self.code = errcode + self.msg = errmsg + + def __str__(self): + return '%s %s' % (self.code, self.msg) + +def create_downloader(url, templatedir, writefunc=None, headerfunc=None, + donefunc=None, HTTPheaders=None, headers_only=False): + """Downloader factory. + + Returns a suitable downloader object according to url type. Raises + DownloaderException if creating the downloader fails. + """ + if url == '': + return DummyDownloader('', writefunc, headerfunc, donefunc, + headers_only) + + elif url.startswith('mms://') or url.startswith('mmsh://'): + try: + libmms + except (NameError, OSError): + raise DownloaderException(501, 'MMS scheme not supported. Install mimms.') + return MMSDownload(url, writefunc, headerfunc, donefunc, + headers_only) + + elif url.startswith('wvt://'): + executable, parameters = parse_external_downloader_wvt_uri(url, templatedir) + if executable is None: + raise DownloaderException(400, 'Invalid wvt:// URL') + try: + return ExternalDownloader(executable, parameters, writefunc, + headerfunc, donefunc, headers_only) + except OSError, (errno, strerror): + raise DownloaderException(500, 'Failed to execute %s: %s' % + (executable, strerror)) + + else: + return CurlDownload(url, writefunc, headerfunc, donefunc, + HTTPheaders, headers_only) + +def convert_curl_error(err, errmsg, aborted): + """Convert a curl error code err to webvi error code.""" + if err == pycurl.E_WRITE_ERROR: + return (402, 'Aborted') + elif err not in CURL_ERROR_CODE_MAPPING: + return (500, errmsg) + else: + return (CURL_ERROR_CODE_MAPPING[err], errmsg) + +def parse_external_downloader_wvt_uri(url, templatedir): + exe = None + params = [] + if not url.startswith('wvt:///bin/'): + return (exe, params) + + split = url[len('wvt:///bin/'):].split('?', 1) + exe = templatedir + '/bin/' + split[0] + + if len(split) > 1: + params = [urllib.unquote(x) for x in split[1].split('&')] + + return (exe, params) + +def _new_process_group(): + os.setpgid(0, 0) + +class DownloaderBase: + """Base class for downloaders.""" + def __init__(self, url): + self.url = url + + def start(self): + """Should start the download process.""" + pass + + def abort(self): + """Signals that the download should be aborted.""" + pass + + def get_url(self): + """Return the URL where the data was downloaded.""" + return self.url + + def get_body(self): + return '' + + def get_encoding(self): + """Return the encoding of the downloaded object, or None if + encoding is not known.""" + return None + + +class DummyDownloader(DownloaderBase, asyncore.file_dispatcher): + """This class doesn't actually download anything, but returns msg + string as if it had been result of a download operation. + """ + def __init__(self, msg, writefunc=None, headerfunc=None, + donefunc=None, headers_only=False): + DownloaderBase.__init__(self, '') + self.donefunc = donefunc + self.writefunc = writefunc + self.headers_only = headers_only + + readfd, writefd = os.pipe() + asyncore.file_dispatcher.__init__(self, readfd) + os.write(writefd, msg) + os.close(writefd) + + def set_file(self, fd): + # Like asyncore.file_dispatcher.set_file() but doesn't call + # add_channel(). We'll call add_channel() in start() when the + # download shall begin. + self.socket = asyncore.file_wrapper(fd) + self._fileno = self.socket.fileno() + + def start(self): + if self.headers_only: + self.donefunc(0, None) + else: + self.add_channel() + + def readable(self): + return True + + def writable(self): + return False + + def handle_read(self): + try: + data = self.recv(4096) + if data and self.writefunc is not None: + self.writefunc(data) + except socket.error: + self.handle_error() + + def handle_close(self): + self.close() + + if self.donefunc is not None: + self.donefunc(0, '') + + +class CurlDownload(DownloaderBase, asyncurl.async_curl_dispatcher): + """Downloads a large number of different URL schemes using + libcurl.""" + def __init__(self, url, writefunc=None, headerfunc=None, + donefunc=None, HTTPheaders=None, headers_only=False): + DownloaderBase.__init__(self, url) + asyncurl.async_curl_dispatcher.__init__(self, url, False) + self.donefunc = donefunc + self.writefunc = writefunc + self.contenttype = None + self.running = True + self.aborted = False + + self.curl.setopt(pycurl.USERAGENT, WEBVID_USER_AGENT) + if headers_only: + self.curl.setopt(pycurl.NOBODY, 1) + if headerfunc is not None: + self.curl.setopt(pycurl.HEADERFUNCTION, headerfunc) + self.curl.setopt(pycurl.WRITEFUNCTION, self.writewrapper) + + headers = [] + if HTTPheaders is not None: + for headername, headerdata in HTTPheaders.iteritems(): + if headername == 'cookie': + self.curl.setopt(pycurl.COOKIE, headerdata) + else: + headers.append(headername + ': ' + headerdata) + + self.curl.setopt(pycurl.HTTPHEADER, headers) + + def start(self): + self.add_channel() + + def close(self): + self.contenttype = self.curl.getinfo(pycurl.CONTENT_TYPE) + asyncurl.async_curl_dispatcher.close(self) + self.running = False + + def abort(self): + self.aborted = True + + def writewrapper(self, data): + if self.aborted: + return 0 + + if self.writefunc is None: + return self.write_to_buf(data) + else: + return self.writefunc(data) + + def get_body(self): + return self.buffer.getvalue() + + def get_encoding(self): + if self.running: + self.contenttype = self.curl.getinfo(pycurl.CONTENT_TYPE) + + if self.contenttype is None: + return None + + values = self.contenttype.split(';', 1) + if len(values) > 1: + for par in values[1].split(' '): + if par.startswith('charset='): + return par[len('charset='):].strip('"') + + return None + + def handle_read(self): + # Do nothing to the read data here. Instead, let the base + # class to collect the data to self.buffer. + pass + + def handle_completed(self, err, errmsg): + asyncurl.async_curl_dispatcher.handle_completed(self, err, errmsg) + if self.donefunc is not None: + err, errmsg = convert_curl_error(err, errmsg, self.aborted) + self.donefunc(err, errmsg) + + +class MMSDownload(DownloaderBase, asyncore.file_dispatcher): + def __init__(self, url, writefunc=None, headerfunc=None, + donefunc=None, headers_only=False): + DownloaderBase.__init__(self, url) + self.r, self.w = os.pipe() + asyncore.file_dispatcher.__init__(self, self.r) + + self.writefunc = writefunc + self.headerfunc = headerfunc + self.donefunc = donefunc + self.relaylen = -1 + self.expectedlen = -1 + self.headers_only = headers_only + self.stream = None + self.errmsg = None + self.aborted = False + + def set_file(self, fd): + self.socket = asyncore.file_wrapper(fd) + self._fileno = self.socket.fileno() + + def recv(self, buffer_size): + data = self.stream.read() + if not data: + self.handle_close() + return '' + else: + return data + + def close(self): + if self.stream is not None: + self.stream.close() + + os.close(self.w) + asyncore.file_dispatcher.close(self) + + def readable(self): + return self.stream is not None + + def writable(self): + return False + + def start(self): + try: + self.stream = libmms.Stream(self.url, 1000000) + except libmms.Error, e: + self.errmsg = e.message + self.handle_close() + return + + os.write(self.w, '0') # signal that this dispatcher has data available + + if self.headerfunc: + # Output the length in a HTTP-like header field so that we + # can use the same callbacks as with HTTP downloads. + ext = utils.get_url_extension(self.url) + if ext == 'wma': + self.headerfunc('Content-Type: audio/x-ms-wma') + else: # if ext == 'wmv': + self.headerfunc('Content-Type: video/x-ms-wmv') + self.headerfunc('Content-Length: %d' % self.stream.length()) + + if self.headers_only: + self.handle_close() + else: + self.add_channel() + + def abort(self): + self.aborted = True + + def handle_read(self): + if self.aborted: + self.handle_close() + return '' + + try: + data = self.recv(4096) + if data and (self.writefunc is not None): + self.writefunc(data) + except libmms.Error, e: + self.errmsg = e.message + self.handle_close() + return + + def handle_close(self): + self.close() + self.stream = None + + if self.errmsg is not None: + self.donefunc(500, self.errmsg) + elif self.aborted: + self.donefunc(402, 'Aborted') + elif self.relaylen < self.expectedlen: + # We got fewer bytes than expected. Maybe the connection + # was lost? + self.donefunc(504, 'Download may be incomplete (length %d < %d)' % + (self.relaylen, self.expectedlen)) + else: + self.donefunc(0, '') + + +class ExternalDownloader(DownloaderBase, asyncore.file_dispatcher): + """Executes an external process and reads its result on standard + output.""" + def __init__(self, executable, parameters, writefunc=None, + headerfunc=None, donefunc=None, headers_only=False): + DownloaderBase.__init__(self, '') + asyncore.dispatcher.__init__(self, None, None) + self.executable = executable + self.writefunc = writefunc + self.headerfunc = headerfunc + self.donefunc = donefunc + self.headers_only = headers_only + self.contenttype = '' + self.aborted = False + + args = [] + for par in parameters: + try: + key, val = par.split('=', 1) + if key == 'contenttype': + self.contenttype = val + elif key == 'arg': + args.append(val) + except ValueError: + pass + + if args: + self.url = args[0] + else: + self.url = executable + self.cmd = [executable] + args + + self.process = None + + def start(self): + self.headerfunc('Content-Type: ' + self.contenttype) + + if self.headers_only: + self.donefunc(0, None) + return + + self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, + close_fds=True, + preexec_fn=_new_process_group) + asyncore.file_dispatcher.__init__(self, os.dup(self.process.stdout.fileno())) + + def abort(self): + if self.process is not None: + self.aborted = True + pg = os.getpgid(self.process.pid) + os.killpg(pg, signal.SIGTERM) + + def readable(self): + # Return True if the subprocess is still alive + return self.process is not None and self.process.returncode is None + + def writable(self): + return False + + def handle_read(self): + try: + data = self.recv(4096) + if data and self.writefunc is not None: + self.writefunc(data) + except socket.error: + self.handle_error() + return + + def handle_close(self): + self.close() + self.process.wait() + + if self.donefunc is not None: + if self.process.returncode == 0: + self.donefunc(0, '') + elif self.aborted and self.process.returncode == -signal.SIGTERM: + self.donefunc(402, 'Aborted') + else: + self.donefunc(500, 'Child process "%s" returned error %s' % \ + (' '.join(self.cmd), str(self.process.returncode))) + + self.process = None |