rewrote the shell script muggle_getlyrics in python and

talk directly to the googlyrics2 python code
author: Wolfgang Rohdewald <wolfgang@rohdewald.de> 2009-01-11 18:49:32 +0100
committer: Wolfgang Rohdewald <wolfgang@rohdewald.de> 2009-01-11 18:49:32 +0100
commit: 45ccb381fdc28e4a33f8494d6352b64963d30c7b (patch)
tree: 994efa4ac06959a08d0b7c07f3556af9980f5a14
parent: 343e1dfc3d31e49f74c3d006e3bc4a882c1aa7a7 (diff)
download: vdr-plugin-muggle-45ccb381fdc28e4a33f8494d6352b64963d30c7b.tar.gz
vdr-plugin-muggle-45ccb381fdc28e4a33f8494d6352b64963d30c7b.tar.bz2
4 files changed, 131 insertions, 125 deletions
diff --git a/HISTORY b/HISTORY
index 9bc5662..fd5fadc 100644
--- a/HISTORY
+++ b/HISTORY
@@ -385,3 +385,5 @@ Balke.
 - Lyrics now work with Googlyrics2
 - if several versions of lyrics for a song are found, it is now possible
   to choose among them. 
+- Lyrics: rewrite the shell script muggle_getlyrics in python and talk 
+  directly to the Googlyrics2 python code
diff --git a/README b/README
index 89375a8..966a721 100644
--- a/README
+++ b/README
@@ -57,7 +57,9 @@ required:
     http://www.xiph.org/ogg/vorbis/)
  - optionally libFLAC++ to replay FLAC files
    (Debian package libflac++-dev or sources from flac.sourceforge.net)
- - recode, python and Googlyrics2 if you want to download lyrics
+ - recode, python, python-chardet and Googlyrics2 if you want to download lyrics.
+   Googlyrics2 must be newer than Beta3, otherwise umlauts will be
+   wrong
 
 The developer versions are needed because their headers are required
 for compilation.  The server need not be on the same machine as the
diff --git a/scripts/mgLyric.py b/scripts/mgLyric.py
deleted file mode 100755
index 97042f5..0000000
--- a/scripts/mgLyric.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-import os, sys, locale
-
-title = sys.argv[1]
-artist = sys.argv[2]
-outdir = sys.argv[3]
-
-# add other possible paths here:
-googlFound = False
-for scriptdir in ('/usr/share/apps/amarok/scripts/Googlyrics2', \
-                'NULL'):
-        if os.path.isdir(scriptdir+'/sites/'):
-                sys.path.append(scriptdir + "/lib/")
-                sys.path.append(scriptdir + "/sites/")
-                os.chdir(scriptdir)
-                googlFound = True
-                break 
-
-if not googlFound:
-	outfile = open(outdir + '/1.raw',"w")
-	outfile.write("Googlyrics2 is not installed\nSee http://quicode.com/googlyircs2")
-	outfile.close
-	sys.exit(0)
-
-Debugging = False
-
-if Debugging:
-	outlyric=["Version 1","Version 2","Version 3"]
-	for idx,item in enumerate(outlyric):
-		outfile = open(outdir + '/' + str(idx) + '.raw',"w")
-		outfile.write(item)
-		outfile.close
-	sys.exit(0)
-
-from Googlyrics import *
-g = Googlyrics()
-
-outlyric = g.find_lyrics(title, artist)
-if len(outlyric) > 0:
-	for idx,item in enumerate(outlyric):
-		l = item.getLyric()
-		if l is not None:
-			if l.lyrics is not None:
-				if len(l.lyrics)>2:
-# if we pipe or write output to a file, python by default recodes into ascii,
-# and sys.stdout.encoding is also set to ascii. But if the system
-# default locale is for example utf-8, we also want the file to be
-# encoded like that
-					outfile = open(outdir + '/' + str(idx) + '.raw',"w")
-					outfile.write(l.lyrics.encode(locale.getdefaultlocale()[1]))
-					outfile.close
-
diff --git a/scripts/muggle_getlyrics b/scripts/muggle_getlyrics
index db7fbdf..f0d5701 100755
--- a/scripts/muggle_getlyrics
+++ b/scripts/muggle_getlyrics
@@ -1,72 +1,126 @@
-#!/bin/sh
-
-# usage:
-# artist title outfile
-#
-# we might find several different lyrics for a song. This is handled
-# by this script: We use a subdirectory outfile.new. If it exists
-# and contains a file, we just return that file and remove it from
-# outfile.new. If outfile.new is empty, we read all versions again
-# with googlyrics.
-
-# ultimately I want to integrate this shell script into mgLyrics.py
-
-artist="$1"
-title="$2"
-outfile="$3"
-orgoutfile=`echo "$3" | sed 's/.tmp$//'`
-outdir="$3.new"
-
-echo > "$outfile".loading
-
-if test ! -d "$outdir"
-then
-	mkdir "$outdir"
-	MGLYRICS="`dirname $0`/mgLyric.py"
-	if test ! -x "$MGLYRICS"
-	then
-		echo $MGLYRICS not found > "$outdir"/1.raw
-	else
-		"$MGLYRICS" "$title" "$artist" "$outdir" 
-	fi
-
-	for i in "$outdir"/*.raw
-	do	
-		test -r "$i" || break
-		cat "$i" |
-		sed 's/\x0d//g' |
-		sed 's/\xc2\xb4/\x27/g' |
-		sed 's/\xc3\x82\x27/\x27/g' |
-		sed 's/\xc3\x82/\x27/g' |
-		sed 's/\xc3\xb9/\x27/g' |
-		sed 's/\xe2\x80\x99/\x27/g' |
-		grep -ive 'NEW.*ringtones' |
-		grep -v -e '--------------' |
-		recode HTML..utf8 |
-		sed 's/\xc2\x91/\x27/g' |	# in unicode, those two are reserved for
-		sed 's/\xc2\x92/\x27/g' |   # private use, but still some sites use them...
-		cat > "$i.converted" 2>/dev/null
-		rm -f "$i"
-		echo cmp -s "$i.converted" "$orgoutfile">>/tmp/log.wr
-		cmp -s "$i.converted" "$orgoutfile"
-		if test $? -eq 0
-		then
-			echo rm -f "$i.converted" >>/tmp/log.wr
-			rm -f "$i.converted"
-			continue
-		fi
-	done
-fi
-
-rmdir "$outdir" 2>/dev/null
-if test -d "$outdir"
-then
-	ls -l  "$outdir"
-	ls "$outdir"/*.converted | head -1 | while read fn
-	do
-		test -r "$fn" || break
-		mv "$fn" "$outfile"
-	done
-	rmdir "$outdir" 2>/dev/null
-fi
-rm -f "$outfile".loading
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+import os, sys, locale, re, codecs
+import filecmp
+from htmlentitydefs import name2codepoint as n2cp
+
+charset = locale.getdefaultlocale()[1] or 'UTF8'	# getdefaultlocale() yields None when no charset can be determined
+title = sys.argv[1].decode(charset).encode('UTF8')	# NOTE(review): the replaced shell script took artist as $1 and title as $2 - confirm the caller's order
+artist = sys.argv[2].decode(charset).encode('UTF8')	# title and artist are handed to Googlyrics2 as UTF-8
+outfile = sys.argv[3]	# file the chosen lyrics version is moved to
+
+lyricsfile = os.path.splitext(outfile)[0]	# outfile without its extension; used to detect duplicates
+outdir = outfile+'.new'	# directory holding the versions not handed out yet
+
+# Locate the Googlyrics2 installation; add other possible paths here:
+googlFound = False
+for scriptdir in ('/usr/share/apps/amarok/scripts/Googlyrics2', 'NULL'):
+	if os.path.isdir(scriptdir+'/sites/'):
+		sys.path.append(scriptdir + "/lib/")
+		sys.path.append(scriptdir + "/sites/")
+		os.chdir(scriptdir)
+		googlFound = True
+		break
+
+if not googlFound:
+	# writeFile() is only defined further down and expects (name, s), so the hint is written straight to outfile
+	with open(outfile,"w") as f: f.write("Googlyrics2 is not installed\nSee http://quicode.com/googlyircs2")
+	sys.exit(0)
+
+from Googlyrics import *
+
+def substitute_entity(match):
+    # Return the character for one entity matched by decode_htmlentities, or
+    # the matched text itself when the entity is unknown or malformed.
+    ent = match.group(2)
+    try:
+        if match.group(1) == "#":
+            # \w in the pattern also admits hex forms such as &#x27;
+            return unichr(int(ent[1:], 16) if ent[:1] in "xX" else int(ent))
+        return unichr(n2cp[ent])
+    except (KeyError, ValueError):
+        return match.group()
+
+def decode_htmlentities(string):
+    # Replace every HTML entity in string; substitute_entity decides per match.
+    return re.sub("&(#?)(\d{1,5}|\w{1,8});", substitute_entity, string)
+
+def countFiles():
+	# Number of entries in outdir; a missing outdir counts as empty.
+	entries = os.listdir(outdir) if os.path.isdir(outdir) else []
+	return len(entries)
+
+def writeFile(name,s):
+	# Store the string s as file `name` inside outdir, creating outdir on demand.
+	if not os.path.isdir(outdir):
+		os.mkdir(outdir)
+	with open(outdir + '/' + name,"w") as f:	# closes the file; `outfile.close` lacked the call parentheses
+		f.write(s)
+	
+def load(debug=False):
+	# Fetch every lyrics version Googlyrics2 finds for the global title and
+	# artist and store each usable one through writeFile() as
+	# '<index>-<sitename>'. With debug=True three dummy versions are written
+	# and the script exits.
+	if debug:
+		for idx,item in enumerate(["Version 1","Version 2","Version 3"]):
+			writeFile(str(idx) + '.raw',item)	# creates outdir on demand
+		sys.exit(0)
+
+	# Characters some sites use where a plain apostrophe is meant. The former
+	# replace() calls used raw strings such as r'\xc2\xb4' and therefore looked
+	# for literal backslashes; these are the code points behind those UTF-8
+	# byte sequences. u'\xc2\x27' must be tried before the bare u'\xc2'.
+	apostrophes = (u'\xb4',u'\xc2\x27',u'\xc2',u'\xf9',u'\u2019',u'\x91',u'\x92')
+	# Advertisement and separator lines are dropped as a whole. str.replace()
+	# knows no regular expressions, so a compiled pattern is used; case is
+	# ignored.
+	junk = re.compile(r'NEW.*ringtones|---------', re.I)
+
+	g = Googlyrics()
+	outlyric = g.find_lyrics(title, artist)
+	if len(outlyric) > 0:
+		print 'versions:',len(outlyric)
+	for idx,item in enumerate(outlyric):
+		l = item.getLyric()
+		if l is None:
+			continue
+		if l.lyrics is None:
+			print 'lyrics is None'
+			continue
+		if len(l.lyrics)<3:
+			print 'lyrics is too short',l.lyrics
+			continue
+		# NOTE(review): assumes l.lyrics is a unicode object - confirm
+		s = decode_htmlentities(l.lyrics)
+		for variant in apostrophes:
+			s = s.replace(variant,u"'")
+		s = u'\n'.join([line for line in s.split(u'\n') if not junk.search(line)])
+		s = s.encode(locale.getdefaultlocale()[1])
+		writeFile(str(idx)+'-'+l.sitename,s)
+
+# -----------------------------------------------
+# main :
+
+semaphore = outfile+'.loading'
+open(semaphore,"w").close()	# empty marker file, removed again in the finally clause
+try:
+	if countFiles() == 0:
+		load()	# no versions waiting in outdir, so fetch them
+	# drop every version identical to the lyrics file already present
+	if os.path.isdir(outdir) and os.path.exists(lyricsfile):
+		for item in os.listdir(outdir):
+			itemfile = outdir + '/' + item
+			if filecmp.cmp(lyricsfile,itemfile):
+				print 'removing duplicate ',itemfile
+				os.remove(itemfile)
+	# hand out the first remaining version in sorted order; listdir() order is arbitrary
+	if countFiles() > 0:
+		os.rename(outdir+'/'+sorted(os.listdir(outdir))[0],outfile)
+	if countFiles() == 0 and os.path.exists(outdir):
+		os.rmdir(outdir)
+finally:
+	os.remove(semaphore)
+
author	Wolfgang Rohdewald <wolfgang@rohdewald.de>	2009-01-11 18:49:32 +0100
committer	Wolfgang Rohdewald <wolfgang@rohdewald.de>	2009-01-11 18:49:32 +0100
commit	45ccb381fdc28e4a33f8494d6352b64963d30c7b (patch)
tree	994efa4ac06959a08d0b7c07f3556af9980f5a14
parent	343e1dfc3d31e49f74c3d006e3bc4a882c1aa7a7 (diff)
download	vdr-plugin-muggle-45ccb381fdc28e4a33f8494d6352b64963d30c7b.tar.gz vdr-plugin-muggle-45ccb381fdc28e4a33f8494d6352b64963d30c7b.tar.bz2