1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
# utils.py - misc. utility functions
#
# Copyright (c) 2009, 2010 Antti Ajanki <antti.ajanki@iki.fi>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import urlparse
import re
import libxml2
import libxslt
import urllib
def get_url_extension(url):
    """Extract and return the file extension from a URL.

    The extension is the text after the last '.' of the last path
    segment, once a possible query ("?query=foo") and fragment ("#bar")
    have been removed. Returns an empty string if the last path
    segment contains no '.'.

    Note: only the last '/' delimits the segment, so a URL without any
    path (e.g. "http://example.com") yields the part of the host name
    after its last dot ("com").
    """
    # Cut the fragment first: it always comes last, and a '?' or '/'
    # inside it must not be mistaken for a query separator or a path
    # delimiter.
    url = url.split('#', 1)[0]
    url = url.split('?', 1)[0]

    # The extension is the part after the last '.' that does not
    # contain '/'.
    idot = url.rfind('.')
    islash = url.rfind('/')
    if idot > islash:
        return url[idot + 1:]
    return ''
def urljoin_query_fix(base, url, allow_fragments=True):
    """Join base and url, handling correctly a url that is a pure query.

    urlparse.urljoin in Python 2.5 (2.6?) and older is broken in
    case url is a pure query. See http://bugs.python.org/issue1432.

    If url starts with '?', the result is base with its query replaced
    by the query of url. Following RFC 3986 section 5.2.2, the fragment
    of the result comes from url (none if url has no fragment); the
    fragment of base is never carried over. All other cases are
    delegated to urlparse.urljoin().

    allow_fragments has the same meaning as in urlparse.urlparse(): if
    it is false, '#' is not treated as a fragment separator.
    """
    if not url.startswith('?'):
        return urlparse.urljoin(base, url, allow_fragments)

    # Only the scheme, location, path and params of the base survive;
    # its query and fragment are replaced by those of the reference.
    bscheme, bnetloc, bpath, bparams, _bquery, _bfragment = \
        urlparse.urlparse(base, '', allow_fragments)

    query = url[1:]
    fragment = ''
    if allow_fragments and '#' in query:
        query, fragment = query.split('#', 1)

    return urlparse.urlunparse((bscheme, bnetloc, bpath,
                                bparams, query, fragment))
def get_content_unicode(node):
    """Return the content of node as a unicode object.

    node.getContent() returns an UTF-8 encoded sequence of bytes (a
    string). It is decoded here with the 'replace' error handler, so
    byte sequences that are not valid UTF-8 are substituted instead of
    raising an exception.
    """
    utf8_bytes = node.getContent()
    return unicode(utf8_bytes, 'UTF-8', 'replace')
def _rewrite_srcurl_refs(resultdoc, root, url):
    """Make the srcurl query parameter absolute in link/button targets.

    Walks the 'link' and 'button' children of root and, in each of
    their 'ref', 'stream' and 'submission' children, replaces the
    value of the srcurl query parameter with the same URL resolved
    against url. Nodes are modified in place; nothing is returned.
    """
    container = root.children
    while container is not None:
        if container.name in ('link', 'button'):
            node = container.children
            while node is not None:
                if node.name in ('ref', 'stream', 'submission'):
                    refurl = node.getContent()
                    match = re.search(r'\?.*srcurl=([^&]*)', refurl)
                    if match is not None:
                        # The parameter value is percent-encoded:
                        # decode, resolve against the page URL and
                        # encode again before splicing it back.
                        oldurl = urllib.unquote(match.group(1))
                        absurl = urljoin_query_fix(url, oldurl)
                        newurl = (refurl[:match.start(1)] +
                                  urllib.quote(absurl) +
                                  refurl[match.end(1):])
                        node.setContent(
                            resultdoc.encodeSpecialChars(newurl))
                node = node.next
        container = container.next


def apply_xslt(buf, encoding, url, xsltfile, params=None):
    """Apply xslt transform from file xsltfile to the string buf
    with parameters params. url is the location of buf. Returns
    the transformed file as a string, or None if the
    transformation couldn't be completed.

    buf is parsed as XML if it starts with '<?xml' and as HTML (using
    encoding) otherwise. After the transformation, relative URLs in
    srcurl query parameters are resolved against url.

    The stylesheet and both documents are freed explicitly on every
    path, including when the transformation or the post-processing
    raises; such exceptions still propagate to the caller.
    """
    stylesheet = libxslt.parseStylesheetFile(xsltfile)
    if stylesheet is None:
        # Can't open the stylesheet.
        return None

    try:
        # htmlReadDoc fails if the buffer is empty but succeeds
        # (returning an empty tree) if the buffer is a single
        # space.
        if buf == '':
            buf = ' '

        # Guess whether this is an XML or HTML document.
        if buf.startswith('<?xml'):
            doc = libxml2.readDoc(buf, url, None,
                                  libxml2.XML_PARSE_NOERROR |
                                  libxml2.XML_PARSE_NOWARNING |
                                  libxml2.XML_PARSE_NONET)
        else:
            doc = libxml2.htmlReadDoc(buf, url, encoding,
                                      libxml2.HTML_PARSE_NOERROR |
                                      libxml2.HTML_PARSE_NOWARNING |
                                      libxml2.HTML_PARSE_NONET)
    except libxml2.treeError:
        # Can't parse the document.
        stylesheet.freeStylesheet()
        return None

    # The stylesheet and the source document are not needed after the
    # transformation, whether it succeeds or not.
    try:
        resultdoc = stylesheet.applyStylesheet(doc, params)
    finally:
        stylesheet.freeStylesheet()
        doc.freeDoc()

    if resultdoc is None:
        # Can't apply the stylesheet.
        return None

    try:
        # Postprocess the document:
        # Resolve relative URLs in srcurl (TODO: this should be done
        # in XSLT)
        root = resultdoc.getRootElement()
        if root is None:
            return None
        _rewrite_srcurl_refs(resultdoc, root, url)
        return resultdoc.serialize('UTF-8')
    finally:
        # Runs on success, on the early return and on exceptions, so
        # the result document is never leaked.
        resultdoc.freeDoc()
|