diff options
Diffstat (limited to 'src/libwebvi/webvi')
-rw-r--r-- | src/libwebvi/webvi/__init__.py | 1 | ||||
-rw-r--r-- | src/libwebvi/webvi/api.py | 289 | ||||
-rw-r--r-- | src/libwebvi/webvi/asyncurl.py | 389 | ||||
-rw-r--r-- | src/libwebvi/webvi/constants.py | 50 | ||||
-rw-r--r-- | src/libwebvi/webvi/download.py | 470 | ||||
-rw-r--r-- | src/libwebvi/webvi/json2xml.py | 69 | ||||
-rw-r--r-- | src/libwebvi/webvi/request.py | 617 | ||||
-rw-r--r-- | src/libwebvi/webvi/utils.py | 134 | ||||
-rw-r--r-- | src/libwebvi/webvi/version.py | 20 |
9 files changed, 2039 insertions, 0 deletions
diff --git a/src/libwebvi/webvi/__init__.py b/src/libwebvi/webvi/__init__.py new file mode 100644 index 0000000..b6d50d5 --- /dev/null +++ b/src/libwebvi/webvi/__init__.py @@ -0,0 +1 @@ +__all__ = ['api', 'asyncurl', 'constants', 'download', 'request', 'utils'] diff --git a/src/libwebvi/webvi/api.py b/src/libwebvi/webvi/api.py new file mode 100644 index 0000000..2fb24ab --- /dev/null +++ b/src/libwebvi/webvi/api.py @@ -0,0 +1,289 @@ +# api.py - webvi API +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""webvi API + +Example workflow: + +1) Create a new request. ref is a wvt:// URI. + +handle = new_request(ref, WebviRequestType.MENU) + +2) Setup a callback function: + +setopt(handle, WebviOpt.WRITEFUNC, my_callback) + +3) Start the network transfer: + +start_handle(handle) + +4) Get active file descriptors, wait for activity on them, and let +webvi process the file descriptor. + +import select + +... + +readfd, writefd, excfd = fdset()[1:4] +readfd, writefd, excfd = select.select(readfd, writefd, excfd, 5.0) +for fd in readfd: + perform(fd, WebviSelectBitmask.READ) +for fd in writefd: + perform(fd, WebviSelectBitmask.WRITE) + +5) Iterate 4) until pop_message returns handle, which indicates that +the request has been completed. + +finished, status, errmsg, remaining = pop_message() +if finished == handle: + print 'done' +""" + +import request +import asyncore +import asyncurl +from constants import WebviErr, WebviOpt, WebviInfo, WebviSelectBitmask, WebviConfig + +# Human readable messages for WebviErr items +error_messages = { + WebviErr.OK: 'Succeeded', + WebviErr.INVALID_HANDLE: 'Invalid handle', + WebviErr.INVALID_PARAMETER: "Invalid parameter", + WebviErr.INTERNAL_ERROR: "Internal error" + } + +# Module-level variables +finished_queue = [] +request_list = request.RequestList() +socket_map = asyncore.socket_map + +# Internal functions + +class MyRequest(request.Request): + def request_done(self, err, errmsg): + """Calls the inherited function and puts the handle of the + finished request to the finished_queue.""" + finished_queue.append(self) + request.Request.request_done(self, err, errmsg) + +# Public functions + +def strerror(err): + """Return human readable error message for conststants.WebviErr""" + try: + return error_messages[err] + except KeyError: + return error_messages[WebviErr.INTERNAL_ERROR] + +def set_config(conf, value): + """Set a new value for a global configuration option conf. + + Currently the only legal value for conf is + constants.WebviConfig.TEMPLATE_PATH, which sets the base directory + for the XSLT templates. + """ + if conf == WebviConfig.TEMPLATE_PATH: + request.set_template_path(value) + return WebviErr.OK + else: + return WebviErr.INVALID_PARAMETER + +def new_request(reference, reqtype): + """Create a new request. + + reference is a wvt:// URI which typically comes from previously + opened menu. reqtype is one of conststants.WebviRequestType and + indicates wheter the reference is a navigation menu, stream that + should be downloaded, or a stream whose URL should be returned. + + Returns a handle (an integer) will be given to following + functions. Return value -1 indicates an error. + """ + req = MyRequest(reference, reqtype) + + if req.srcurl is None: + return -1 + + return request_list.put(req) + +def set_opt(handle, option, value): + """Set configuration options on a handle. + + option specifies option's name (one of constants.WebviOpt values) + and value is the new value for the option. + """ + + try: + req = request_list[handle] + except KeyError: + return WebviErr.INVALID_HANDLE + + if option == WebviOpt.WRITEFUNC: + req.writefunc = value + elif option == WebviOpt.WRITEDATA: + req.writedata = value + elif option == WebviOpt.READFUNC: + req.readfunc = value + elif option == WebviOpt.READDATA: + req.readdata = value + else: + return WebviErr.INVALID_PARAMETER + + return WebviErr.OK + +def get_info(handle, info): + """Get information about a handle. + + info is the type of data that is to be returned (one of + constants.WebviInfo values). + """ + try: + req = request_list[handle] + except KeyError: + return (WebviErr.INVALID_HANDLE, None) + + val = None + if info == WebviInfo.URL: + if req.dl is not None: + val = req.dl.get_url() + else: + val = req.srcurl + elif info == WebviInfo.CONTENT_LENGTH: + val = req.contentlength + elif info == WebviInfo.CONTENT_TYPE: + val = req.contenttype + elif info == WebviInfo.STREAM_TITLE: + val = req.streamtitle + else: + return (WebviErr.INVALID_PARAMETER, None) + + return (WebviErr.OK, val) + +def start_handle(handle): + """Start the network transfer on a handle.""" + try: + req = request_list[handle] + except KeyError: + return WebviErr.INVALID_HANDLE + + req.start() + return WebviErr.OK + +def stop_handle(handle): + """Aborts network transfer on a handle. + + The abort is confirmed by pop_message() returning the handle with + an non-zero error code. + """ + try: + req = request_list[handle] + except KeyError: + return WebviErr.INVALID_HANDLE + + if not req.is_finished(): + req.stop() + + return WebviErr.OK + +def delete_handle(handle): + """Frees resources related to handle. + + This should be called when the transfer has been completed and the + user is done with the handle. If the transfer is still in progress + when delete_handle() is called, the transfer is aborted. After + calling delete_handle() the handle value will be invalid, and + should not be feed to other functions anymore. + """ + try: + del request_list[handle] + except KeyError: + return WebviErr.INVALID_HANDLE + + return WebviErr.OK + +def pop_message(): + """Retrieve messages about finished requests. + + If a request has been finished since the last call to this + function, returns a tuple (handle, status, msg, num_messages), + where handle identifies the finished request, status is a numeric + status code (non-zero for an error), msg is a description of an + error as string, and num_messages is the number of messages that + can be retrieved by calling pop_messages() again immediately. If + the finished requests queue is empty, returns (-1, -1, "", 0). + """ + if finished_queue: + req = finished_queue.pop() + return (req.handle, req.status, req.errmsg, len(finished_queue)) + else: + return (-1, -1, "", 0) + +def fdset(): + """Get the list of file descriptors that are currently in use by + the library. + + Returrns a tuple, where the first item is a constants.WebviErr + value indicating the success of the call, the next three values + are lists of descriptors that should be monitored for reading, + writing, and exceptional conditions, respectively. The last item + is the maximum of the file descriptors in the three lists. + """ + readfd = [] + writefd = [] + excfd = [] + maxfd = -1 + + for fd, disp in socket_map.iteritems(): + if disp.readable(): + readfd.append(fd) + if fd > maxfd: + maxfd = fd + if disp.writable(): + writefd.append(fd) + if fd > maxfd: + maxfd = fd + + return (WebviErr.OK, readfd, writefd, excfd, maxfd) + +def perform(fd, ev_bitmask): + """Perform transfer on file descriptor fd. + + fd is a file descriptor that has been signalled to be ready by + select() or similar system call. ev_bitmask specifies what kind of + activity has been detected using values of + constants.WebviSelectBitmask. If ev_bitmask is + constants.WebviSelectBitmask.TIMEOUT the type of activity is check + by the function. + + This function should be called every few seconds with fd=-1, + ev_bitmask=constants.WebviSelectBitmask.TIMEOUT even if no + activity has been signalled on the file descriptors to ensure + correct handling of timeouts and other internal processing. + """ + if fd < 0: + asyncurl.poll() + else: + disp = socket_map.get(fd) + if disp is not None: + if ev_bitmask & WebviSelectBitmask.READ != 0 or \ + (ev_bitmask == 0 and disp.readable()): + disp.handle_read_event() + if ev_bitmask & WebviSelectBitmask.WRITE != 0 or \ + (ev_bitmask == 0 and disp.writable()): + disp.handle_write_event() + + return (WebviErr.OK, len(socket_map)) diff --git a/src/libwebvi/webvi/asyncurl.py b/src/libwebvi/webvi/asyncurl.py new file mode 100644 index 0000000..afc575a --- /dev/null +++ b/src/libwebvi/webvi/asyncurl.py @@ -0,0 +1,389 @@ +# asyncurl.py - Wrapper class for using pycurl objects in asyncore +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""This is a wrapper for using pycurl objects in asyncore. + +Start a transfer by creating an async_curl_dispatch, and call +asyncurl.loop() instead of asyncore.loop(). +""" + +import asyncore +import pycurl +import traceback +import os +import select +import time +import cStringIO +from errno import EINTR + +SOCKET_TIMEOUT = pycurl.SOCKET_TIMEOUT +CSELECT_IN = pycurl.CSELECT_IN +CSELECT_OUT = pycurl.CSELECT_OUT +CSELECT_ERR = pycurl.CSELECT_ERR + +def poll(timeout=0.0, map=None, mdisp=None): + if map is None: + map = asyncore.socket_map + if mdisp is None: + mdisp = multi_dispatcher + if map: + timeout = min(timeout, mdisp.timeout/1000.0) + + r = []; w = []; e = [] + for fd, obj in map.items(): + is_r = obj.readable() + is_w = obj.writable() + if is_r: + r.append(fd) + if is_w: + w.append(fd) + if is_r or is_w: + e.append(fd) + if [] == r == w == e: + time.sleep(timeout) + else: + try: + r, w, e = select.select(r, w, e, timeout) + except select.error, err: + if err[0] != EINTR: + raise + else: + return + + if [] == r == w == e: + mdisp.socket_action(SOCKET_TIMEOUT, 0) + return + + for fd in r: + obj = map.get(fd) + if obj is None: + continue + asyncore.read(obj) + + for fd in w: + obj = map.get(fd) + if obj is None: + continue + asyncore.write(obj) + + for fd in e: + obj = map.get(fd) + if obj is None: + continue + asyncore._exception(obj) + +def loop(timeout=30.0, use_poll=False, map=None, count=None, mdisp=None): + if map is None: + map = asyncore.socket_map + if mdisp is None: + mdisp = multi_dispatcher + + if use_poll and hasattr(select, 'poll'): + print 'poll2 not implemented' + poll_fun = poll + + if count is None: + while map: + poll_fun(timeout, map, mdisp) + + else: + while map and count > 0: + poll_fun(timeout, map, mdisp) + count = count - 1 + +def noop_callback(s): + pass + + +class curl_multi_dispatcher: + """A dispatcher for pycurl.CurlMulti() objects. An instance of + this class is created automatically. There is usually no need to + construct one manually.""" + def __init__(self, socket_map=None): + if socket_map is None: + self._map = asyncore.socket_map + else: + self._map = socket_map + self.dispatchers = {} + self.timeout = 1000 + self._sockets_removed = False + self._curlm = pycurl.CurlMulti() + self._curlm.setopt(pycurl.M_SOCKETFUNCTION, self.socket_callback) + self._curlm.setopt(pycurl.M_TIMERFUNCTION, self.timeout_callback) + + def socket_callback(self, action, socket, user_data, socket_data): +# print 'socket callback: %d, %s' % \ +# (socket, {pycurl.POLL_NONE: "NONE", +# pycurl.POLL_IN: "IN", +# pycurl.POLL_OUT: "OUT", +# pycurl.POLL_INOUT: "INOUT", +# pycurl.POLL_REMOVE: "REMOVE"}[action]) + + if action == pycurl.POLL_NONE: + return + elif action == pycurl.POLL_REMOVE: + if socket in self._map: + del self._map[socket] + self._sockets_removed = True + return + + obj = self._map.get(socket) + if obj is None: + obj = dispatcher_wrapper(socket, self) + self._map[socket] = obj + + if action == pycurl.POLL_IN: + obj.set_readable(True) + obj.set_writable(False) + elif action == pycurl.POLL_OUT: + obj.set_readable(False) + obj.set_writable(True) + elif action == pycurl.POLL_INOUT: + obj.set_readable(True) + obj.set_writable(True) + + def timeout_callback(self, msec): + self.timeout = msec + + def attach(self, curldisp): + """Starts a transfer on curl handle by attaching it to this + multihandle.""" + self.dispatchers[curldisp.curl] = curldisp + try: + self._curlm.add_handle(curldisp.curl) + except pycurl.error: + # the curl object is already on this multi-stack + pass + + while self._curlm.socket_all()[0] == pycurl.E_CALL_MULTI_PERFORM: + pass + + self.check_completed(True) + + def detach(self, curldisp): + """Removes curl handle from this multihandle, and fire its + completion callback function.""" + self.del_curl(curldisp.curl) + + # libcurl does not send POLL_REMOVE when a handle is aborted + for socket, curlobj in self._map.items(): + if curlobj == curldisp: + + print 'handle stopped but socket in map' + + del self._map[socket] + break + + def del_curl(self, curl): + try: + self._curlm.remove_handle(curl) + except pycurl.error: + # the curl object is not on this multi-stack + pass + if curl in self.dispatchers: + del self.dispatchers[curl] + curl.close() + + def socket_action(self, fd, evbitmask): + res = -1 + OK = False + while not OK: + try: + res = self._curlm.socket_action(fd, evbitmask) + OK = True + except pycurl.error: + # Older libcurls may return CURLM_CALL_MULTI_PERFORM, + # which pycurl (at least 7.19.0) converts to an + # exception. If that happens, call socket_action + # again. + pass + return res + + def check_completed(self, force): + if not force and not self._sockets_removed: + return + self._sockets_removed = False + + nmsg, success, failed = self._curlm.info_read() + for handle in success: + disp = self.dispatchers.get(handle) + if disp is not None: + try: + disp.handle_completed(0, None) + except: + self.handle_error() + self.del_curl(handle) + for handle, err, errmsg in failed: + disp = self.dispatchers.get(handle) + if disp is not None: + try: + disp.handle_completed(err, errmsg) + except: + self.handle_error() + self.del_curl(handle) + + def handle_error(self): + print 'Exception occurred in multicurl processing' + print traceback.format_exc() + + +class dispatcher_wrapper: + """An internal helper class that connects a file descriptor in the + asyncore.socket_map to a curl_multi_dispatcher.""" + def __init__(self, fd, multicurl): + self.fd = fd + self.multicurl = multicurl + self.read_flag = False + self.write_flag = False + + def readable(self): + return self.read_flag + + def writable(self): + return self.write_flag + + def set_readable(self, x): + self.read_flag = x + + def set_writable(self, x): + self.write_flag = x + + def handle_read_event(self): + self.multicurl.socket_action(self.fd, CSELECT_IN) + self.multicurl.check_completed(False) + + def handle_write_event(self): + self.multicurl.socket_action(self.fd, CSELECT_OUT) + self.multicurl.check_completed(False) + + def handle_expt_event(self): + self.multicurl.socket_action(self.fd, CSELECT_ERR) + self.multicurl.check_completed(False) + + def handle_error(self): + print 'Exception occurred during processing of a curl request' + print traceback.format_exc() + + +class async_curl_dispatcher: + """A dispatcher class for pycurl transfers.""" + def __init__(self, url, auto_start=True): + """Initializes a pycurl object self.curl. The default is to + download url to an internal buffer whose content can be read + with self.recv(). If auto_start is False, the transfer is not + started before a call to add_channel(). + """ + self.url = url + self.socket = None + self.buffer = cStringIO.StringIO() + self.curl = pycurl.Curl() + self.curl.setopt(pycurl.URL, self.url) + self.curl.setopt(pycurl.FOLLOWLOCATION, 1) + self.curl.setopt(pycurl.AUTOREFERER, 1) + self.curl.setopt(pycurl.MAXREDIRS, 10) + self.curl.setopt(pycurl.FAILONERROR, 1) + self.curl.setopt(pycurl.WRITEFUNCTION, self.write_to_buf) + if auto_start: + self.add_channel() + + def write_to_buf(self, msg): + self.buffer.write(msg) + self.handle_read() + + def send(self, data): + raise NotImplementedError + + def recv(self, buffer_size): + # buffer_size is ignored + ret = self.buffer.getvalue() + self.buffer.reset() + self.buffer.truncate() + return ret + + def add_channel(self, multidisp=None): + if multidisp is None: + multidisp = multi_dispatcher + multidisp.attach(self) + + def del_channel(self, multidisp=None): + if multidisp is None: + multidisp = multi_dispatcher + multidisp.detach(self) + + def close(self): + self.del_channel() + + def log_info(self, message, type='info'): + if type != 'info': + print '%s: %s' % (type, message) + + def handle_error(self): + print 'Exception occurred during processing of a curl request' + print traceback.format_exc() + self.close() + + def handle_read(self): + self.log_info('unhandled read event', 'warning') + + def handle_write(self): + self.log_info('unhandled write event', 'warning') + + def handle_completed(self, err, errmsg): + """Called when the download has finished. err is a numeric + error code (or 0 if the download was successfull) and errmsg + is a curl error message as a string.""" + # It seems that a reference to self.write_to_buf forbids + # garbage collection from deleting this object. unsetopt() or + # setting the callback to None are not allowed. Is there a + # better way? + self.curl.setopt(pycurl.WRITEFUNCTION, noop_callback) + self.close() + + +def test(): + + class curl_request(async_curl_dispatcher): + def __init__(self, url, outfile, i): + async_curl_dispatcher.__init__(self, url, False) + self.id = i + self.outfile = outfile + self.add_channel() + + def handle_read(self): + buf = self.recv(4096) + print '%s: writing %d bytes' % (self.id, len(buf)) + self.outfile.write(buf) + + def handle_completed(self, err, errmsg): + if err != 0: + print '%s: error: %d %s' % (self.id, err, errmsg) + else: + print '%s: completed' % self.id + + curl_request('http://www.python.org', open('python.out', 'w'), 1) + curl_request('http://en.wikipedia.org/wiki/Main_Page', open('wikipedia.out', 'w'), 2) + loop(timeout=5.0) + + +pycurl.global_init(pycurl.GLOBAL_DEFAULT) +try: + multi_dispatcher +except NameError: + multi_dispatcher = curl_multi_dispatcher() + +if __name__ == '__main__': + test() diff --git a/src/libwebvi/webvi/constants.py b/src/libwebvi/webvi/constants.py new file mode 100644 index 0000000..2797178 --- /dev/null +++ b/src/libwebvi/webvi/constants.py @@ -0,0 +1,50 @@ +# constants.py - Python definitions for constants in libwebvi.h +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Keep these in sync with libwebvi.h + +class WebviRequestType: + MENU = 0 + FILE = 1 + STREAMURL = 2 + +class WebviErr: + OK = 0 + INVALID_HANDLE = 1 + INVALID_PARAMETER = 2 + INTERNAL_ERROR = -1 + +class WebviOpt: + WRITEFUNC = 0 + READFUNC = 1 + WRITEDATA = 2 + READDATA = 3 + +class WebviInfo: + URL = 0 + CONTENT_LENGTH = 1 + CONTENT_TYPE = 2 + STREAM_TITLE = 3 + +class WebviSelectBitmask: + TIMEOUT = 0 + READ = 1 + WRITE = 2 + EXCEPTION = 4 + +class WebviConfig: + TEMPLATE_PATH = 0 diff --git a/src/libwebvi/webvi/download.py b/src/libwebvi/webvi/download.py new file mode 100644 index 0000000..34240ff --- /dev/null +++ b/src/libwebvi/webvi/download.py @@ -0,0 +1,470 @@ +# download.py - webvi downloader backend +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import os +import asyncore +import asynchat +import cStringIO +import urllib +import subprocess +import socket +import signal +import pycurl +import asyncurl +import utils +import version + +WEBVID_USER_AGENT = 'libwebvi/%s %s' % (version.VERSION, pycurl.version) +MOZILLA_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5' + +try: + from libmimms import libmms +except ImportError, e: + pass + +# Mapping from curl error codes to webvi errors. The error constants +# are defined only in pycurl 7.16.1 and newer. +if pycurl.version_info()[2] >= 0x071001: + CURL_ERROR_CODE_MAPPING = \ + {pycurl.E_OK: 0, + pycurl.E_OPERATION_TIMEOUTED: 408, + pycurl.E_OUT_OF_MEMORY: 500, + pycurl.E_PARTIAL_FILE: 504, + pycurl.E_READ_ERROR: 504, + pycurl.E_RECV_ERROR: 504, + pycurl.E_REMOTE_FILE_NOT_FOUND: 404, + pycurl.E_TOO_MANY_REDIRECTS: 404, + pycurl.E_UNSUPPORTED_PROTOCOL: 500, + pycurl.E_URL_MALFORMAT: 400, + pycurl.E_COULDNT_CONNECT: 506, + pycurl.E_COULDNT_RESOLVE_HOST: 506, + pycurl.E_COULDNT_RESOLVE_PROXY: 506, + pycurl.E_FILE_COULDNT_READ_FILE: 404, + pycurl.E_GOT_NOTHING: 504, + pycurl.E_HTTP_RETURNED_ERROR: 404, + pycurl.E_INTERFACE_FAILED: 506, + pycurl.E_LOGIN_DENIED: 403} +else: + CURL_ERROR_CODE_MAPPING = {pycurl.E_OK: 0} + +class DownloaderException(Exception): + def __init__(self, errcode, errmsg): + self.code = errcode + self.msg = errmsg + + def __str__(self): + return '%s %s' % (self.code, self.msg) + +def create_downloader(url, templatedir, writefunc=None, headerfunc=None, + donefunc=None, HTTPheaders=None, headers_only=False): + """Downloader factory. + + Returns a suitable downloader object according to url type. Raises + DownloaderException if creating the downloader fails. + """ + if url == '': + return DummyDownloader('', writefunc, headerfunc, donefunc, + headers_only) + + elif url.startswith('mms://') or url.startswith('mmsh://'): + try: + libmms + except (NameError, OSError): + raise DownloaderException(501, 'MMS scheme not supported. Install mimms.') + return MMSDownload(url, writefunc, headerfunc, donefunc, + headers_only) + + elif url.startswith('wvt://'): + executable, parameters = parse_external_downloader_wvt_uri(url, templatedir) + if executable is None: + raise DownloaderException(400, 'Invalid wvt:// URL') + try: + return ExternalDownloader(executable, parameters, writefunc, + headerfunc, donefunc, headers_only) + except OSError, (errno, strerror): + raise DownloaderException(500, 'Failed to execute %s: %s' % + (executable, strerror)) + + else: + return CurlDownload(url, writefunc, headerfunc, donefunc, + HTTPheaders, headers_only) + +def convert_curl_error(err, errmsg, aborted): + """Convert a curl error code err to webvi error code.""" + if err == pycurl.E_WRITE_ERROR: + return (402, 'Aborted') + elif err not in CURL_ERROR_CODE_MAPPING: + return (500, errmsg) + else: + return (CURL_ERROR_CODE_MAPPING[err], errmsg) + +def parse_external_downloader_wvt_uri(url, templatedir): + exe = None + params = [] + if not url.startswith('wvt:///bin/'): + return (exe, params) + + split = url[len('wvt:///bin/'):].split('?', 1) + exe = templatedir + '/bin/' + split[0] + + if len(split) > 1: + params = [urllib.unquote(x) for x in split[1].split('&')] + + return (exe, params) + +def _new_process_group(): + os.setpgid(0, 0) + +class DownloaderBase: + """Base class for downloaders.""" + def __init__(self, url): + self.url = url + + def start(self): + """Should start the download process.""" + pass + + def abort(self): + """Signals that the download should be aborted.""" + pass + + def get_url(self): + """Return the URL where the data was downloaded.""" + return self.url + + def get_body(self): + return '' + + def get_encoding(self): + """Return the encoding of the downloaded object, or None if + encoding is not known.""" + return None + + +class DummyDownloader(DownloaderBase, asyncore.file_dispatcher): + """This class doesn't actually download anything, but returns msg + string as if it had been result of a download operation. + """ + def __init__(self, msg, writefunc=None, headerfunc=None, + donefunc=None, headers_only=False): + DownloaderBase.__init__(self, '') + self.donefunc = donefunc + self.writefunc = writefunc + self.headers_only = headers_only + + readfd, writefd = os.pipe() + asyncore.file_dispatcher.__init__(self, readfd) + os.write(writefd, msg) + os.close(writefd) + + def set_file(self, fd): + # Like asyncore.file_dispatcher.set_file() but doesn't call + # add_channel(). We'll call add_channel() in start() when the + # download shall begin. + self.socket = asyncore.file_wrapper(fd) + self._fileno = self.socket.fileno() + + def start(self): + if self.headers_only: + self.donefunc(0, None) + else: + self.add_channel() + + def readable(self): + return True + + def writable(self): + return False + + def handle_read(self): + try: + data = self.recv(4096) + if data and self.writefunc is not None: + self.writefunc(data) + except socket.error: + self.handle_error() + + def handle_close(self): + self.close() + + if self.donefunc is not None: + self.donefunc(0, '') + + +class CurlDownload(DownloaderBase, asyncurl.async_curl_dispatcher): + """Downloads a large number of different URL schemes using + libcurl.""" + def __init__(self, url, writefunc=None, headerfunc=None, + donefunc=None, HTTPheaders=None, headers_only=False): + DownloaderBase.__init__(self, url) + asyncurl.async_curl_dispatcher.__init__(self, url, False) + self.donefunc = donefunc + self.writefunc = writefunc + self.contenttype = None + self.running = True + self.aborted = False + + self.curl.setopt(pycurl.USERAGENT, WEBVID_USER_AGENT) + if headers_only: + self.curl.setopt(pycurl.NOBODY, 1) + if headerfunc is not None: + self.curl.setopt(pycurl.HEADERFUNCTION, headerfunc) + self.curl.setopt(pycurl.WRITEFUNCTION, self.writewrapper) + + headers = [] + if HTTPheaders is not None: + for headername, headerdata in HTTPheaders.iteritems(): + if headername == 'cookie': + self.curl.setopt(pycurl.COOKIE, headerdata) + else: + headers.append(headername + ': ' + headerdata) + + self.curl.setopt(pycurl.HTTPHEADER, headers) + + def start(self): + self.add_channel() + + def close(self): + self.contenttype = self.curl.getinfo(pycurl.CONTENT_TYPE) + asyncurl.async_curl_dispatcher.close(self) + self.running = False + + def abort(self): + self.aborted = True + + def writewrapper(self, data): + if self.aborted: + return 0 + + if self.writefunc is None: + return self.write_to_buf(data) + else: + return self.writefunc(data) + + def get_body(self): + return self.buffer.getvalue() + + def get_encoding(self): + if self.running: + self.contenttype = self.curl.getinfo(pycurl.CONTENT_TYPE) + + if self.contenttype is None: + return None + + values = self.contenttype.split(';', 1) + if len(values) > 1: + for par in values[1].split(' '): + if par.startswith('charset='): + return par[len('charset='):].strip('"') + + return None + + def handle_read(self): + # Do nothing to the read data here. Instead, let the base + # class to collect the data to self.buffer. + pass + + def handle_completed(self, err, errmsg): + asyncurl.async_curl_dispatcher.handle_completed(self, err, errmsg) + if self.donefunc is not None: + err, errmsg = convert_curl_error(err, errmsg, self.aborted) + self.donefunc(err, errmsg) + + +class MMSDownload(DownloaderBase, asyncore.file_dispatcher): + def __init__(self, url, writefunc=None, headerfunc=None, + donefunc=None, headers_only=False): + DownloaderBase.__init__(self, url) + self.r, self.w = os.pipe() + asyncore.file_dispatcher.__init__(self, self.r) + + self.writefunc = writefunc + self.headerfunc = headerfunc + self.donefunc = donefunc + self.relaylen = -1 + self.expectedlen = -1 + self.headers_only = headers_only + self.stream = None + self.errmsg = None + self.aborted = False + + def set_file(self, fd): + self.socket = asyncore.file_wrapper(fd) + self._fileno = self.socket.fileno() + + def recv(self, buffer_size): + data = self.stream.read() + if not data: + self.handle_close() + return '' + else: + return data + + def close(self): + if self.stream is not None: + self.stream.close() + + os.close(self.w) + asyncore.file_dispatcher.close(self) + + def readable(self): + return self.stream is not None + + def writable(self): + return False + + def start(self): + try: + self.stream = libmms.Stream(self.url, 1000000) + except libmms.Error, e: + self.errmsg = e.message + self.handle_close() + return + + os.write(self.w, '0') # signal that this dispatcher has data available + + if self.headerfunc: + # Output the length in a HTTP-like header field so that we + # can use the same callbacks as with HTTP downloads. + ext = utils.get_url_extension(self.url) + if ext == 'wma': + self.headerfunc('Content-Type: audio/x-ms-wma') + else: # if ext == 'wmv': + self.headerfunc('Content-Type: video/x-ms-wmv') + self.headerfunc('Content-Length: %d' % self.stream.length()) + + if self.headers_only: + self.handle_close() + else: + self.add_channel() + + def abort(self): + self.aborted = True + + def handle_read(self): + if self.aborted: + self.handle_close() + return '' + + try: + data = self.recv(4096) + if data and (self.writefunc is not None): + self.writefunc(data) + except libmms.Error, e: + self.errmsg = e.message + self.handle_close() + return + + def handle_close(self): + self.close() + self.stream = None + + if self.errmsg is not None: + self.donefunc(500, self.errmsg) + elif self.aborted: + self.donefunc(402, 'Aborted') + elif self.relaylen < self.expectedlen: + # We got fewer bytes than expected. Maybe the connection + # was lost? + self.donefunc(504, 'Download may be incomplete (length %d < %d)' % + (self.relaylen, self.expectedlen)) + else: + self.donefunc(0, '') + + +class ExternalDownloader(DownloaderBase, asyncore.file_dispatcher): + """Executes an external process and reads its result on standard + output.""" + def __init__(self, executable, parameters, writefunc=None, + headerfunc=None, donefunc=None, headers_only=False): + DownloaderBase.__init__(self, '') + asyncore.dispatcher.__init__(self, None, None) + self.executable = executable + self.writefunc = writefunc + self.headerfunc = headerfunc + self.donefunc = donefunc + self.headers_only = headers_only + self.contenttype = '' + self.aborted = False + + args = [] + for par in parameters: + try: + key, val = par.split('=', 1) + if key == 'contenttype': + self.contenttype = val + elif key == 'arg': + args.append(val) + except ValueError: + pass + + if args: + self.url = args[0] + else: + self.url = executable + self.cmd = [executable] + args + + self.process = None + + def start(self): + self.headerfunc('Content-Type: ' + self.contenttype) + + if self.headers_only: + self.donefunc(0, None) + return + + self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, + close_fds=True, + preexec_fn=_new_process_group) + asyncore.file_dispatcher.__init__(self, os.dup(self.process.stdout.fileno())) + + def abort(self): + if self.process is not None: + self.aborted = True + pg = os.getpgid(self.process.pid) + os.killpg(pg, signal.SIGTERM) + + def readable(self): + # Return True if the subprocess is still alive + return self.process is not None and self.process.returncode is None + + def writable(self): + return False + + def handle_read(self): + try: + data = self.recv(4096) + if data and self.writefunc is not None: + self.writefunc(data) + except socket.error: + self.handle_error() + return + + def handle_close(self): + self.close() + self.process.wait() + + if self.donefunc is not None: + if self.process.returncode == 0: + self.donefunc(0, '') + elif self.aborted and self.process.returncode == -signal.SIGTERM: + self.donefunc(402, 'Aborted') + else: + self.donefunc(500, 'Child process "%s" returned error %s' % \ + (' '.join(self.cmd), str(self.process.returncode))) + + self.process = None diff --git a/src/libwebvi/webvi/json2xml.py b/src/libwebvi/webvi/json2xml.py new file mode 100644 index 0000000..372e6c6 --- /dev/null +++ b/src/libwebvi/webvi/json2xml.py @@ -0,0 +1,69 @@ +import sys +import libxml2 + +try: + import json +except ImportError: + try: + import simplejson as json + except ImportError: + print 'Error: install simplejson' + raise + +def _serialize_to_xml(obj, xmlnode): + """Create XML representation of a Python object (list, tuple, + dist, or basic number and string types).""" + if type(obj) in (list, tuple): + listnode = libxml2.newNode('list') + for li in obj: + itemnode = libxml2.newNode('li') + _serialize_to_xml(li, itemnode) + listnode.addChild(itemnode) + xmlnode.addChild(listnode) + + elif type(obj) == dict: + dictnode = libxml2.newNode('dict') + for key, val in obj.iteritems(): + itemnode = libxml2.newNode(key.encode('utf-8')) + _serialize_to_xml(val, itemnode) + dictnode.addChild(itemnode) + xmlnode.addChild(dictnode) + + elif type(obj) in (str, unicode, int, long, float, complex, bool): + content = libxml2.newText(unicode(obj).encode('utf-8')) + xmlnode.addChild(content) + + elif type(obj) == type(None): + pass + + else: + raise TypeError('Unsupported type %s while serializing to xml' + % type(obj)) + +def json2xml(jsonstr, encoding=None): + """Convert JSON string jsonstr to XML tree.""" + try: + parsed = json.loads(jsonstr, encoding) + except ValueError: + return None + + xmldoc = libxml2.newDoc("1.0") + root = libxml2.newNode("jsondocument") + xmldoc.setRootElement(root) + + _serialize_to_xml(parsed, root) + + return xmldoc + +def test(): + xml = json2xml(open(sys.argv[1]).read()) + + if xml is None: + return + + print xml.serialize('utf-8') + + xml.freeDoc() + +if __name__ == '__main__': + test() diff --git a/src/libwebvi/webvi/request.py b/src/libwebvi/webvi/request.py new file mode 100644 index 0000000..e19eb9c --- /dev/null +++ b/src/libwebvi/webvi/request.py @@ -0,0 +1,617 @@ +# request.py - webvi request class +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import urllib +import libxml2 +import os.path +import cStringIO +import re +import download +import sys +import utils +import json2xml +from constants import WebviRequestType + +DEBUG = False + +DEFAULT_TEMPLATE_PATH = '/usr/local/share/webvi/templates' +template_path = DEFAULT_TEMPLATE_PATH + +def debug(msg): + if DEBUG: + if type(msg) == unicode: + sys.stderr.write(msg.encode('ascii', 'replace')) + else: + sys.stderr.write(msg) + sys.stderr.write('\n') + +def set_template_path(path): + global template_path + + if path is None: + template_path = os.path.realpath(DEFAULT_TEMPLATE_PATH) + else: + template_path = os.path.realpath(path) + + debug("set_template_path " + template_path) + +def parse_reference(reference): + """Parses URLs of the following form: + + wvt:///youtube/video.xsl?srcurl=http%3A%2F%2Fwww.youtube.com%2F¶m=name1,value1¶m=name2,value2 + + reference is assumed to be URL-encoded UTF-8 string. + + Returns (template, srcurl, params, processing_instructions) where + template if the URL path name (the part before ?), srcurl is the + parameter called srcurl, and params is a dictionary of (name, + quoted-value) pairs extracted from param parameters. Parameter + values are quoted so that the xslt parser handles them as string. + processing_instructions is dictionary of options that affect the + further processing of the data. + """ + try: + reference = str(reference) + except UnicodeEncodeError: + return (None, None, None, None) + + if not reference.startswith('wvt:///'): + return (None, None, None, None) + + ref = reference[len('wvt:///'):] + + template = None + srcurl = '' + parameters = {} + substitutions = {} + refsettings = {'HTTP-headers': {}} + + fields = ref.split('?', 1) + template = fields[0] + if len(fields) == 1: + return (template, srcurl, parameters, refsettings) + + for par in fields[1].split('&'): + paramfields = par.split('=', 1) + key = paramfields[0] + + if len(paramfields) == 2: + value = urllib.unquote(paramfields[1]) + else: + value = '' + + if key.lower() == 'srcurl': + srcurl = value + + elif key.lower() == 'param': + fields2 = value.split(',', 1) + pname = fields2[0].lower() + if len(fields2) == 2: + pvalue = "'" + fields2[1] + "'" + else: + pvalue = "''" + parameters[pname] = pvalue + + elif key.lower() == 'subst': + substfields = value.split(',', 1) + if len(substfields) == 2: + substitutions[substfields[0]] = substfields[1] + + elif key.lower() == 'minquality': + try: + refsettings['minquality'] = int(value) + except ValueError: + pass + + elif key.lower() == 'maxquality': + try: + refsettings['maxquality'] = int(value) + except ValueError: + pass + + elif key.lower() == 'postprocess': + refsettings.setdefault('postprocess', []).append(value) + + elif key.lower() == 'contenttype': + refsettings['overridecontenttype'] = value + + elif key.lower() == 'http-header': + try: + headername, headerdata = value.split(',', 1) + except ValueError: + continue + refsettings['HTTP-headers'][headername] = headerdata + + if substitutions: + srcurl = brace_substitution(srcurl, substitutions) + + return (template, srcurl, parameters, refsettings) + +def brace_substitution(template, subs): + """Substitute subs[x] for '{x}' in template. Unescape {{ to { and + }} to }. Unescaping is not done in substitution keys, i.e. while + scanning for a closing brace after a single opening brace.""" + strbuf = cStringIO.StringIO() + + last_pos = 0 + for match in re.finditer(r'{{?|}}', template): + next_pos = match.start() + if next_pos < last_pos: + continue + + strbuf.write(template[last_pos:next_pos]) + if match.group(0) == '{{': + strbuf.write('{') + last_pos = next_pos+2 + + elif match.group(0) == '}}': + strbuf.write('}') + last_pos = next_pos+2 + + else: # match.group(0) == '{' + key_end = template.find('}', next_pos+1) + if key_end == -1: + strbuf.write(template[next_pos:]) + last_pos = len(template) + break + + try: + strbuf.write(urllib.quote(subs[template[next_pos+1:key_end]])) + except KeyError: + strbuf.write(template[next_pos:key_end+1]) + last_pos = key_end+1 + + strbuf.write(template[last_pos:]) + return strbuf.getvalue() + + +class Request: + DEFAULT_URL_PRIORITY = 50 + + def __init__(self, reference, reqtype): + self.handle = None + self.dl = None + + # state variables + self.xsltfile, self.srcurl, self.xsltparameters, self.processing = \ + parse_reference(reference) + self.type = reqtype + self.status = -1 + self.errmsg = None + self.mediaurls = [] + + # stream information + self.contenttype = 'text/xml' + self.contentlength = -1 + self.streamtitle = '' + + # callbacks + self.writefunc = None + self.writedata = None + self.readfunc = None + self.readdata = None + + def handle_header(self, buf): + namedata = buf.split(':', 1) + if len(namedata) == 2: + headername, headerdata = namedata + if headername.lower() == 'content-type': + # Strip parameters like charset="utf-8" + self.contenttype = headerdata.split(';', 1)[0].strip() + elif headername.lower() == 'content-length': + try: + self.contentlength = int(headerdata.strip()) + except ValueError: + self.contentlength = -1 + + def setup_downloader(self, url, writefunc, headerfunc, donefunc, + HTTPheaders=None, headers_only=False): + try: + self.dl = download.create_downloader(url, + template_path, + writefunc, + headerfunc, + donefunc, + HTTPheaders, + headers_only) + self.dl.start() + except download.DownloaderException, exc: + self.dl = None + if donefunc is not None: + donefunc(exc.code, exc.msg) + + def start(self): + debug('start %s\ntemplate = %s, type = %s\n' + 'parameters = %s, processing = %s' % + (self.srcurl, self.xsltfile, self.type, str(self.xsltparameters), + str(self.processing))) + + if self.type == WebviRequestType.MENU and self.srcurl == 'mainmenu': + self.send_mainmenu() + else: + self.setup_downloader(self.srcurl, None, + self.handle_header, + self.finished_apply_xslt, + self.processing['HTTP-headers']) + + def stop(self): + if self.dl is not None: + debug("aborting") + self.dl.abort() + + def start_download(self, url=None): + """Initialize a download. + + If url is None, pop the first URL out of self.mediaurls. If + URL is an ASX playlist, read the content URL from it and start + to download the actual content. + """ + while url is None or url == '': + try: + url = self.mediaurls.pop(0) + except IndexError: + self.request_done(406, 'No more URLs left') + + debug('Start_download ' + url) + + # reset stream status + self.contenttype = 'text/xml' + self.contentlength = -1 + + if self.is_asx_playlist(url): + self.setup_downloader(url, None, + self.handle_header, + self.finished_playlist_loaded, + self.processing['HTTP-headers']) + + else: + self.setup_downloader(url, self.writewrapper, + self.handle_header, + self.finished_download, + self.processing['HTTP-headers']) + + def check_and_send_url(self, url=None): + """Check if the target exists (currently only for HTTP URLs) + before relaying the URL to the client.""" + while url is None or url == '': + try: + url = self.mediaurls.pop(0) + except IndexError: + self.request_done(406, 'No more URLs left') + return + + debug('check_and_send_url ' + str(url)) + + if self.is_asx_playlist(url): + self.setup_downloader(url, None, self.handle_header, + self.finished_playlist_loaded, + self.processing['HTTP-headers']) + elif url.startswith('http://') or url.startswith('https://'): + self.checking_url = url + self.setup_downloader(url, None, None, + self.finished_check_url, + self.processing['HTTP-headers'], True) + else: + self.writewrapper(url) + self.request_done(0, None) + + def send_mainmenu(self): + """Build the XML main menu from the module description files + in the hard drive. + """ + if not os.path.isdir(template_path): + self.request_done(404, "Can't access service directory %s" % + template_path) + return + + debug('Reading XSLT templates from ' + template_path) + + # Find menu items in the service.xml files in the subdirectories + menuitems = {} + for f in os.listdir(template_path): + if f == 'bin': + continue + + filename = os.path.join(template_path, f, 'service.xml') + try: + doc = libxml2.parseFile(filename) + except libxml2.parserError: + debug("Failed to parse " + filename); + continue + + title = '' + url = '' + + root = doc.getRootElement() + if (root is None) or (root.name != 'service'): + debug("Root node is not 'service' in " + filename); + doc.freeDoc() + continue + node = root.children + while node is not None: + if node.name == 'title': + title = utils.get_content_unicode(node) + elif node.name == 'ref': + url = utils.get_content_unicode(node) + node = node.next + doc.freeDoc() + + if (title == '') or (url == ''): + debug("Empty <title> or <ref> in " + filename); + continue + + menuitems[title.lower()] = ('<link>\n' + '<label>%s</label>\n' + '<ref>%s</ref>\n' + '</link>\n' % + (libxml2.newText(title), + libxml2.newText(url))) + # Sort the menu items + titles = menuitems.keys() + titles.sort() + + # Build the menu + mainmenu = ('<?xml version="1.0"?>\n' + '<wvmenu>\n' + '<title>Select video source</title>\n') + for t in titles: + mainmenu += menuitems[t] + mainmenu += '</wvmenu>' + + self.dl = download.DummyDownloader(mainmenu, + writefunc=self.writewrapper, + donefunc=self.request_done) + self.dl.start() + + def writewrapper(self, inp): + """Wraps pycurl write callback (with the data as the only + parameter) into webvi write callback (with signature (data, + length, usertag)). If self.writefunc is not set, write to + stdout.""" + if self.writefunc is not None: + inplen = len(inp) + written = self.writefunc(inp, inplen, self.writedata) + if written != inplen: + self.dl.close() + self.request_done(405, 'Write callback failed') + else: + sys.stdout.write(inp) + + def is_asx_playlist(self, url): + if utils.get_url_extension(url).lower() == 'asx': + return True + else: + return False + + def get_url_from_asx(self, asx, asxurl): + """Simple ASX parser. Return the content of the first <ref> + tag.""" + try: + doc = libxml2.htmlReadDoc(asx, asxurl, None, + libxml2.HTML_PARSE_NOERROR | + libxml2.HTML_PARSE_NOWARNING | + libxml2.HTML_PARSE_NONET) + except libxml2.treeError: + debug('Can\'t parse ASX:\n' + asx) + return None + root = doc.getRootElement() + ret = self._get_ref_recursive(root).strip() + doc.freeDoc() + return ret + + def _get_ref_recursive(self, node): + if node is None: + return None + if node.name.lower() == 'ref': + href = node.prop('href') + if href is not None: + return href + child = node.children + while child: + res = self._get_ref_recursive(child) + if res is not None: + return res + child = child.next + return None + + def parse_mediaurl(self, xml, minpriority, maxpriority): + debug('parse_mediaurl\n' + xml) + + self.streamtitle = '???' + mediaurls = [] + + try: + doc = libxml2.parseDoc(xml) + except libxml2.parserError: + debug('Invalid XML') + return mediaurls + + root = doc.getRootElement() + if root is None: + debug('No root node') + return mediaurls + + urls_and_priorities = [] + node = root.children + while node: + if node.name == 'title': + self.streamtitle = utils.get_content_unicode(node) + elif node.name == 'url': + try: + priority = int(node.prop('priority')) + except (ValueError, TypeError): + priority = self.DEFAULT_URL_PRIORITY + + content = node.getContent() + if priority >= minpriority and priority <= maxpriority and content != '': + urls_and_priorities.append((priority, content)) + node = node.next + doc.freeDoc() + + urls_and_priorities.sort() + urls_and_priorities.reverse() + mediaurls = [b[1] for b in urls_and_priorities] + + return mediaurls + + def finished_download(self, err, errmsg): + if err == 0: + self.request_done(0, None) + elif err != 402 and self.mediaurls: + debug('Download failed (%s %s).\nTrying the next one.' % (err, errmsg)) + self.dl = None + self.start_download() + else: + self.request_done(err, errmsg) + + def finished_playlist_loaded(self, err, errmsg): + if err == 0: + url = self.get_url_from_asx(self.dl.get_body(), + self.dl.get_url()) + if url is None: + err = 404 + errmsg = 'No ref tag in ASX file' + else: + if not self.is_asx_playlist(url) and url.startswith('http:'): + # The protocol is really "Windows Media HTTP + # Streaming Protocol", not plain HTTP, even though + # the scheme in the ASX file says "http://". We + # can't do MS-WMSP but luckily most MS-WMSP + # servers support MMS, too. + url = 'mms:' + url[5:] + + if self.type == WebviRequestType.STREAMURL: + self.check_and_send_url(url) + else: + self.start_download(url) + + if err != 0: + if not self.mediaurls: + self.request_done(err, errmsg) + else: + if self.type == WebviRequestType.STREAMURL: + self.check_and_send_url() + else: + self.start_download() + + def finished_apply_xslt(self, err, errmsg): + if err != 0: + self.request_done(err, errmsg) + return + + url = self.srcurl + + # Add input documentURL to the parameters + params = self.xsltparameters.copy() + params['docurl'] = "'" + url + "'" + + minpriority = self.processing.get('minquality', 0) + maxpriority = self.processing.get('maxquality', 100) + + xsltpath = os.path.join(template_path, self.xsltfile) + + # Check that xsltpath is inside the template directory + if os.path.commonprefix([template_path, os.path.realpath(xsltpath)]) != template_path: + self.request_done(503, 'Insecure template path') + return + + xml = self.dl.get_body() + encoding = self.dl.get_encoding() + + if self.processing.has_key('postprocess') and \ + 'json2xml' in self.processing['postprocess']: + xmldoc = json2xml.json2xml(xml, encoding) + if xmldoc is None: + self.request_done(503, 'Invalid JSON content') + return + xml = xmldoc.serialize('utf-8') + encoding = 'utf-8' + + #debug(xml) + + resulttree = utils.apply_xslt(xml, encoding, url, + xsltpath, params) + if resulttree is None: + self.request_done(503, 'XSLT transformation failed') + return + + if self.type == WebviRequestType.MENU: + debug("result:") + debug(resulttree) + self.writewrapper(resulttree) + self.request_done(0, None) + elif self.type == WebviRequestType.STREAMURL: + self.mediaurls = self.parse_mediaurl(resulttree, minpriority, maxpriority) + if self.mediaurls: + self.check_and_send_url() + else: + self.request_done(406, 'No valid URLs found') + elif self.type == WebviRequestType.FILE: + self.mediaurls = self.parse_mediaurl(resulttree, minpriority, maxpriority) + if self.mediaurls: + self.start_download() + else: + self.request_done(406, 'No valid URLs found') + else: + self.request_done(0, None) + + def finished_extract_playlist_url(self, err, errmsg): + if err == 0: + url = self.get_url_from_asx(self.dl.get_body(), + self.dl.get_url()) + if url is not None: + if self.is_asx_playlist(url): + self.setup_downloader(url, None, None, + self.finished_extract_playlist_url, + self.processing['HTTP-headers']) + else: + if url.startswith('http:'): + url = 'mms:' + url[5:] + self.check_and_send_url(url) + else: + self.request_done(503, 'XSLT tranformation failed to produce URL') + else: + self.request_done(err, errmsg) + + + def finished_check_url(self, err, errmsg): + if err == 0: + self.writewrapper(self.checking_url) + self.request_done(0, None) + else: + self.check_and_send_url() + + def request_done(self, err, errmsg): + debug('request_done: %d %s' % (err, errmsg)) + + self.status = err + self.errmsg = errmsg + self.dl = None + + def is_finished(self): + return self.status >= 0 + + +class RequestList(dict): + nextreqnum = 1 + + def put(self, req): + reqnum = RequestList.nextreqnum + RequestList.nextreqnum += 1 + req.handle = reqnum + self[reqnum] = req + return reqnum diff --git a/src/libwebvi/webvi/utils.py b/src/libwebvi/webvi/utils.py new file mode 100644 index 0000000..cefe09a --- /dev/null +++ b/src/libwebvi/webvi/utils.py @@ -0,0 +1,134 @@ +# utils.py - misc. utility functions +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import urlparse +import re +import libxml2 +import libxslt +import urllib + +def get_url_extension(url): + """Extracts and returns the file extension from a URL.""" + # The extension is located right before possible query + # ("?query=foo") or fragment ("#bar"). + try: + i = url.index('?') + url = url[:i] + except ValueError: + pass + # The extension is the part after the last '.' that does not + # contain '/'. + idot = url.rfind('.') + islash = url.rfind('/') + if idot > islash: + return url[idot+1:] + else: + return '' + +def urljoin_query_fix(base, url, allow_fragments=True): + """urlparse.urljoin in Python 2.5 (2.6?) and older is broken in + case url is a pure query. See http://bugs.python.org/issue1432. + This handles correctly the case where base is a full (http) url + and url is a query, and calls urljoin() for other cases.""" + if url.startswith('?'): + bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ + urlparse.urlparse(base, '', allow_fragments) + bquery = url[1:] + return urlparse.urlunparse((bscheme, bnetloc, bpath, + bparams, bquery, bfragment)) + else: + return urlparse.urljoin(base, url, allow_fragments) + +def get_content_unicode(node): + """node.getContent() returns an UTF-8 encoded sequence of bytes (a + string). Convert it to a unicode object.""" + return unicode(node.getContent(), 'UTF-8', 'replace') + +def apply_xslt(buf, encoding, url, xsltfile, params=None): + """Apply xslt transform from file xsltfile to the string buf + with parameters params. url is the location of buf. Returns + the transformed file as a string, or None if the + transformation couldn't be completed.""" + stylesheet = libxslt.parseStylesheetFile(xsltfile) + + if stylesheet is None: + #self.log_info('Can\'t open stylesheet %s' % xsltfile, 'warning') + return None + try: + # htmlReadDoc fails if the buffer is empty but succeeds + # (returning an empty tree) if the buffer is a single + # space. + if buf == '': + buf = ' ' + + # Guess whether this is an XML or HTML document. + if buf.startswith('<?xml'): + doc = libxml2.readDoc(buf, url, None, + libxml2.XML_PARSE_NOERROR | + libxml2.XML_PARSE_NOWARNING | + libxml2.XML_PARSE_NONET) + else: + #self.log_info('Using HTML parser', 'debug') + doc = libxml2.htmlReadDoc(buf, url, encoding, + libxml2.HTML_PARSE_NOERROR | + libxml2.HTML_PARSE_NOWARNING | + libxml2.HTML_PARSE_NONET) + except libxml2.treeError: + stylesheet.freeStylesheet() + #self.log_info('Can\'t parse XML document', 'warning') + return None + resultdoc = stylesheet.applyStylesheet(doc, params) + stylesheet.freeStylesheet() + doc.freeDoc() + if resultdoc is None: + #self.log_info('Can\'t apply stylesheet', 'warning') + return None + + # Postprocess the document: + # Resolve relative URLs in srcurl (TODO: this should be done in XSLT) + root = resultdoc.getRootElement() + if root is None: + resultdoc.freeDoc() + return None + + node2 = root.children + while node2 is not None: + if node2.name not in ['link', 'button']: + node2 = node2.next + continue + + node = node2.children + while node is not None: + if (node.name == 'ref') or (node.name == 'stream') or \ + (node.name == 'submission'): + refurl = node.getContent() + + match = re.search(r'\?.*srcurl=([^&]*)', refurl) + if match is not None: + oldurl = urllib.unquote(match.group(1)) + absurl = urljoin_query_fix(url, oldurl) + newurl = refurl[:match.start(1)] + \ + urllib.quote(absurl) + \ + refurl[match.end(1):] + node.setContent(resultdoc.encodeSpecialChars(newurl)) + + node = node.next + node2 = node2.next + + ret = resultdoc.serialize('UTF-8') + resultdoc.freeDoc() + return ret diff --git a/src/libwebvi/webvi/version.py b/src/libwebvi/webvi/version.py new file mode 100644 index 0000000..26cb817 --- /dev/null +++ b/src/libwebvi/webvi/version.py @@ -0,0 +1,20 @@ +# version.py - webvi version +# +# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +MAJOR = '0' +MINOR = '2' +VERSION = MAJOR + '.' + MINOR |