src/libwebvi/webvi/utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

# utils.py - misc. utility functions
#
# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import urlparse
import re
import libxml2
import libxslt
import urllib

def get_url_extension(url):
    """Extracts and returns the file extension from a URL."""
    # The extension is located right before possible query
    # ("?query=foo") or fragment ("#bar").
    try:
        i = url.index('?')
        url = url[:i]
    except ValueError:
        pass
    # The extension is the part after the last '.' that does not
    # contain '/'.
    idot = url.rfind('.')
    islash = url.rfind('/')
    if idot > islash:
        return url[idot+1:]
    else:
        return ''

def urljoin_query_fix(base, url, allow_fragments=True):
    """urlparse.urljoin in Python 2.5 (2.6?) and older is broken in
    case url is a pure query. See http://bugs.python.org/issue1432.
    This handles correctly the case where base is a full (http) url
    and url is a query, and calls urljoin() for other cases."""
    if url.startswith('?'):
        bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse.urlparse(base, '', allow_fragments)
        bquery = url[1:]
        return urlparse.urlunparse((bscheme, bnetloc, bpath, 
                                    bparams, bquery, bfragment))
    else:
        return urlparse.urljoin(base, url, allow_fragments)

def get_content_unicode(node):
    """node.getContent() returns an UTF-8 encoded sequence of bytes (a
    string). Convert it to a unicode object."""
    return unicode(node.getContent(), 'UTF-8', 'replace')

def apply_xslt(buf, encoding, url, xsltfile, params=None):
    """Apply xslt transform from file xsltfile to the string buf
    with parameters params. url is the location of buf. Returns
    the transformed file as a string, or None if the
    transformation couldn't be completed."""
    stylesheet = libxslt.parseStylesheetFile(xsltfile)

    if stylesheet is None:
        #self.log_info('Can\'t open stylesheet %s' % xsltfile, 'warning')
        return None
    try:
        # htmlReadDoc fails if the buffer is empty but succeeds
        # (returning an empty tree) if the buffer is a single
        # space.
        if buf == '':
            buf = ' '

        # Guess whether this is an XML or HTML document.
        if buf.startswith('<?xml'):
            doc = libxml2.readDoc(buf, url, None,
                                  libxml2.XML_PARSE_NOERROR |
                                  libxml2.XML_PARSE_NOWARNING |
                                  libxml2.XML_PARSE_NONET)
        else:
            #self.log_info('Using HTML parser', 'debug')
            doc = libxml2.htmlReadDoc(buf, url, encoding,
                                      libxml2.HTML_PARSE_NOERROR |
                                      libxml2.HTML_PARSE_NOWARNING |
                                      libxml2.HTML_PARSE_NONET)
    except libxml2.treeError:
        stylesheet.freeStylesheet()
        #self.log_info('Can\'t parse XML document', 'warning')
        return None
    resultdoc = stylesheet.applyStylesheet(doc, params)
    stylesheet.freeStylesheet()
    doc.freeDoc()
    if resultdoc is None:
        #self.log_info('Can\'t apply stylesheet', 'warning')
        return None

    # Postprocess the document:
    # Resolve relative URLs in srcurl (TODO: this should be done in XSLT)
    root = resultdoc.getRootElement()
    if root is None:
        resultdoc.freeDoc()
        return None

    node2 = root.children
    while node2 is not None:
        if node2.name not in ['link', 'button']:
            node2 = node2.next
            continue

        node = node2.children
        while node is not None:
            if (node.name == 'ref') or (node.name == 'stream') or \
                    (node.name == 'submission'):
                refurl = node.getContent()

                match = re.search(r'\?.*srcurl=([^&]*)', refurl)
                if match is not None:
                    oldurl = urllib.unquote(match.group(1))
                    absurl = urljoin_query_fix(url, oldurl)
                    newurl = refurl[:match.start(1)] + \
                        urllib.quote(absurl) + \
                        refurl[match.end(1):]
                    node.setContent(resultdoc.encodeSpecialChars(newurl))

            node = node.next
        node2 = node2.next

    ret = resultdoc.serialize('UTF-8')
    resultdoc.freeDoc()
    return ret