summaryrefslogtreecommitdiff
path: root/lib/Locale/Maketext/Extract.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Locale/Maketext/Extract.pm')
-rw-r--r--lib/Locale/Maketext/Extract.pm502
1 files changed, 502 insertions, 0 deletions
diff --git a/lib/Locale/Maketext/Extract.pm b/lib/Locale/Maketext/Extract.pm
new file mode 100644
index 0000000..abb3904
--- /dev/null
+++ b/lib/Locale/Maketext/Extract.pm
@@ -0,0 +1,502 @@
+package Locale::Maketext::Extract;
+$Locale::Maketext::Extract::VERSION = '0.07';
+
+use strict;
+
+=head1 NAME
+
+Locale::Maketext::Extract - Extract translatable strings from source
+
+=head1 SYNOPSIS
+
+ my $Ext = Locale::Maketext::Extract->new;
+ $Ext->read_po('messages.po');
+ $Ext->extract_file($_) for <*.pl>;
+ $Ext->compile;
+ $Ext->write_po('messages.po');
+
+=head1 DESCRIPTION
+
+This module can extract translatable strings from files, and write
+them back to PO files. It can also parse existing PO files and merge
+their contents with newly extracted strings.
+
+A command-line utility, L<xgettext.pl>, is installed with this module
+as well.
+
+Following formats of input files are supported:
+
+=over 4
+
+=item Perl source files
+
+Valid localization function names are: C<translate>, C<maketext>,
+C<gettext>, C<loc>, C<x>, C<_> and C<__>.
+
+=item HTML::Mason
+
+Strings inside C<E<lt>&|/lE<gt>I<...>E<lt>/&E<gt>> and
+C<E<lt>&|/locE<gt>I<...>E<lt>/&E<gt>> are extracted.
+
+=item Template Toolkit
+
+Strings inside C<[%|l%]...[%END%]> or C<[%|loc%]...[%END%]>
+are extracted.
+
+=item Text::Template
+
+Sentences between C<STARTxxx> and C<ENDxxx> are extracted individually.
+
+=item Generic Template
+
+Strings inside {{...}} are extracted.
+
+=back
+
+=head1 METHODS
+
+=head2 Constructor
+
+ new
+
+=cut
+
+sub new {
+ my $class = shift;
+ bless({ header => '', entries => {}, lexicon => {}, @_ }, $class);
+}
+
+=head2 Accessors
+
+ header, set_header
+ lexicon, set_lexicon, msgstr, set_msgstr
+ entries, set_entries, entry, add_entry, del_entry
+ clear
+
+=cut
+
+sub header { $_[0]{header} || _default_header() };
+sub set_header { $_[0]{header} = $_[1] };
+
+sub lexicon { $_[0]{lexicon} }
+sub set_lexicon { $_[0]{lexicon} = $_[1] || {}; delete $_[0]{lexicon}{''}; }
+
+sub msgstr { $_[0]{lexicon}{$_[1]} }
+sub set_msgstr { $_[0]{lexicon}{$_[1]} = $_[2] }
+
+sub entries { $_[0]{entries} }
+sub set_entries { $_[0]{entries} = $_[1] || {} }
+
+sub entry { @{$_[0]->entries->{$_[1]} || [] } }
+sub add_entry { push @{$_[0]->entries->{$_[1]}}, $_[2] }
+sub del_entry { delete $_[0]->entries->{$_[1]} }
+
+sub clear {
+ $_[0]->set_header;
+ $_[0]->set_lexicon;
+ $_[0]->set_entries;
+}
+
+=head2 PO File manipulation
+
+ read_po
+ write_po
+
+=cut
+
+sub read_po {
+ my ($self, $file, $verbatim) = @_;
+ my $header = '';
+
+ local *LEXICON;
+ open LEXICON, $file or die $!;
+ while (<LEXICON>) {
+ (1 .. /^$/) or last;
+ $header .= $_;
+ }
+ 1 while chomp $header;
+
+ $self->set_header("$header\n");
+
+ require Locale::Maketext::Lexicon::Gettext;
+ my $lexicon = Locale::Maketext::Lexicon::Gettext->parse(<LEXICON>);
+
+ $self->set_lexicon(
+ $verbatim ? { map _to_gettext($_), %$lexicon } : $lexicon
+ );
+ close LEXICON;
+}
+
+sub write_po {
+ my ($self, $file, $add_format) = @_;
+
+ local *LEXICON;
+ open LEXICON, ">$file" or die "Can't write to $file$!\n";
+
+ print LEXICON $self->header;
+
+ my $sorter = {};
+
+ foreach my $msgid ($self->msgids) {
+ $sorter->{$self->msg_positions($msgid)} = $msgid;
+ }
+
+ foreach my $name (sort keys %$sorter) {
+ my $msgid = $sorter->{$name};
+ $self->normalize_space($msgid);
+ print LEXICON "\n#--------------------\n";
+ print LEXICON $self->msg_positions($msgid);
+ print LEXICON $self->msg_variables($msgid);
+ print LEXICON $self->msg_format($msgid) if $add_format;
+ print LEXICON $self->msg_out($msgid);
+ }
+}
+
+=head2 Extraction
+
+ extract
+ extract_file
+
+=cut
+
+use constant NUL => 0;
+use constant BEG => 1;
+use constant PAR => 2;
+use constant QUO1 => 3;
+use constant QUO2 => 4;
+use constant QUO3 => 5;
+sub extract {
+ my $self = shift;
+ my $file = shift;
+ local $_ = shift;
+
+ my $entries = $self->entries;
+ my $line = 1; pos($_) = 0;
+
+ # Text::Template
+ if (/^STARTTEXT$/m and /^ENDTEXT$/m) {
+ require HTML::Parser;
+ require Lingua::EN::Sentence;
+
+ {
+ package MyParser;
+ @MyParser::ISA = 'HTML::Parser';
+ *{'text'} = sub {
+ my ($self, $str, $is_cdata) = @_;
+ my $sentences = Lingua::EN::Sentence::get_sentences($str) or return;
+ $str =~ s/\n/ /g; $str =~ s/^\s+//; $str =~ s/\s+$//;
+ $self->add_entry($str => [$file, $line]);
+ };
+ }
+
+ my $p = MyParser->new;
+ while (m/\G((.*?)^(?:START|END)[A-Z]+$)/smg) {
+ my ($str) = ($2);
+ $line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
+ $p->parse($str); $p->eof;
+ }
+ $_ = '';
+ }
+
+ # HTML::Mason
+ $line = 1; pos($_) = 0;
+ while (m!\G(.*?<&\|/l(?:oc)?(.*?)&>(.*?)</&>)!sg) {
+ my ($vars, $str) = ($2, $3);
+ $line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
+ $self->add_entry($str, [ $file, $line, $vars ]);
+ }
+
+ # Template Toolkit
+ $line = 1; pos($_) = 0;
+ while (m!\G(.*?\[%\s*\|l(?:oc)?(.*?)\s*%\](.*?)\[%\s*END\s*%\])!sg) {
+ my ($vars, $str) = ($2, $3);
+ $line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
+ $vars =~ s/^\s*\(//;
+ $vars =~ s/\)\s*$//;
+ $self->add_entry($str, [ $file, $line, $vars ]);
+ }
+
+ # Generic Template:
+ $line = 1; pos($_) = 0;
+ while (m/\G(.*?(?<!\{)\{\{(?!\{)(.*?)\}\})/sg) {
+ my ($vars, $str) = ('', $2);
+ $line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
+ $self->add_entry($str, [ $file, $line, $vars ]);
+ }
+
+ my $quoted = '(\')([^\\\']*(?:\\.[^\\\']*)*)(\')|(\")([^\\\"]*(?:\\.[^\\\"]*)*)(\")';
+
+ # Comment-based mark: "..." # loc
+ $line = 1; pos($_) = 0;
+ while (m/\G(.*?($quoted)[\}\)\],]*\s*\#\s*loc\s*$)/smog) {
+ my $str = substr($2, 1, -1);
+ $line += ( () = ( $1 =~ /\n/g ) ); # cryptocontext!
+ $str =~ s/\\(["'])/$1/g;
+ $self->add_entry($str, [ $file, $line, '' ]);
+ }
+
+ # Comment-based pair mark: "..." => "..." # loc_pair
+ $line = 1; pos($_) = 0;
+ while (m/\G(.*?(\w+)\s*=>\s*($quoted)[\}\)\],]*\s*\#\s*loc_pair\s*$)/smg) {
+ my $key = $2;
+ my $val = substr($3, 1, -1);
+ $line += ( () = ( $1 =~ /\n/g ) ); # cryptocontext!
+ $key =~ s/\\(["'])/$1/g;
+ $val =~ s/\\(["'])/$1/g;
+ $self->add_entry($key, [ $file, $line, '' ]);
+ $self->add_entry($val, [ $file, $line, '' ]);
+ }
+
+ # Perl code:
+ my ($state,$str,$vars,$quo)=(0);
+ pos($_) = 0;
+ my $orig = 1 + (() = ((my $__ = $_) =~ /\n/g));
+
+ PARSER: {
+ $_ = substr($_, pos($_)) if (pos($_));
+ my $line = $orig - (() = ((my $__ = $_) =~ /\n/g));
+
+ # maketext or loc or _
+ $state == NUL && m/\b(translate|maketext|gettext|__?|loc|x)/gc
+ && do { $state = BEG; redo };
+ $state == BEG && m/^([\s\t\n]*)/gc && redo;
+
+ # begin ()
+ $state == BEG && m/^([\S\(])\s*/gc
+ && do { $state = ( ($1 eq '(') ? PAR : NUL); redo };
+
+ # begin or end of string
+ $state == PAR && m/^(\')/gc && do { $state = $quo = QUO1; redo };
+ $state == QUO1 && m/^([^\']+)/gc && do { $str .= $1; redo };
+ $state == QUO1 && m/^\'/gc && do { $state = PAR; redo };
+
+ $state == PAR && m/^\"/gc && do { $state = $quo = QUO2; redo };
+ $state == QUO2 && m/^([^\"]+)/gc && do { $str .= $1; redo };
+ $state == QUO2 && m/^\"/gc && do { $state = PAR; redo };
+
+ $state == PAR && m/^\`/gc && do { $state = $quo = QUO3; redo };
+ $state == QUO3 && m/^([^\`]*)/gc && do { $str .= $1; redo };
+ $state == QUO3 && m/^\`/gc && do { $state = PAR; redo };
+
+ # end ()
+ $state == PAR && m/^\s*[\)]/gc && do {
+ $state = NUL;
+ $vars =~ s/[\n\r]//g if ($vars);
+ if ($quo == QUO1) {
+ $str =~ s/\\([\\'])/$1/g; # normalize q strings
+ }
+ else {
+ $str =~ s/(\\(?:[0x]..|c?.))/"qq($1)"/eeg; # normalize qq / qx strings
+ }
+ push @{$entries->{$str}}, [ $file, $line - (() = $str =~ /\n/g), $vars] if ($str);
+ undef $str; undef $vars;
+ redo;
+ };
+
+ # a line of vars
+ $state == PAR && m/^([^\)]*)/gc && do { $vars .= "$1\n"; redo };
+ }
+}
+
+sub extract_file {
+ my ($self, $file) = @_;
+
+ local($/, *FH);
+ open FH, $file or die $!;
+ $self->extract($file => scalar <FH>);
+ close FH;
+}
+
+=head2 Compilation
+
+ compile
+ normalize_space
+
+=cut
+
+sub compile {
+ my ($self, $verbatim) = @_;
+ my $entries = $self->entries;
+ my $lexicon = $self->lexicon;
+
+ foreach my $str (sort keys %$entries) {
+ my $ostr = $str;
+ my $entry = $entries->{$str};
+ my $lexi = $lexicon->{$ostr};
+
+ $str = _to_gettext($str, $verbatim);
+ $lexi = _to_gettext($lexi, $verbatim);
+
+ $lexicon->{$str} ||= '';
+ next if $ostr eq $str;
+
+ $lexicon->{$str} ||= $lexi;
+ delete $entries->{$ostr}; delete $lexicon->{$ostr};
+ $entries->{$str} = $entry;
+ }
+
+ return %$lexicon;
+}
+
+my %Escapes = map {("\\$_" => eval("qq(\\$_)"))} qw(t r f b a e);
+sub normalize_space {
+ my ($self, $msgid) = @_;
+ my $nospace = $msgid;
+ $nospace =~ s/ +$//;
+
+ return unless (!$self->has_msgid($msgid) and $self->has_msgid($nospace));
+
+ $self->set_msgstr(
+ $msgid => $self->msgstr($nospace) .
+ (' ' x (length($msgid) - length($nospace)))
+ );
+}
+
+=head2 Lexicon accessors
+
+ msgids, has_msgid,
+ msgstr, set_msgstr
+ msg_positions, msg_variables, msg_format, msg_out
+
+=cut
+
+sub msgids {
+ sort keys %{$_[0]{lexicon}}
+}
+sub has_msgid { length $_[0]->msgstr($_[1]) }
+
+sub msg_positions {
+ my ($self, $msgid) = @_;
+ my %files = (map { ( " $_->[0]:$_->[1]" => 1 ) } $self->entry($msgid));
+ return join('', '#:', sort(keys %files), "\n");
+}
+
+sub msg_variables {
+ my ($self, $msgid) = @_;
+ my $out = '';
+
+ my %seen;
+ foreach my $entry ( grep { $_->[2] } $self->entry($msgid) ) {
+ my ($file, $line, $var) = @$entry;
+ $var =~ s/^\s*,\s*//; $var =~ s/\s*$//;
+ $out .= "#. ($var)\n" unless !length($var) or $seen{$var}++;
+ }
+
+ return $out;
+}
+
+sub msg_format {
+ my ($self, $msgid) = @_;
+ return "#, perl-maketext-format\n" if $msgid =~ /%(?:\d|\w+\([^\)]*\))/;
+ return '';
+}
+
+sub msg_out {
+ my ($self, $msgid) = @_;
+
+ return "msgid " . _format($msgid) .
+ "msgstr " . _format($self->msgstr($msgid));
+}
+
+=head2 Internal utilities
+
+ _default_header
+ _to_gettext
+ _escape
+ _format
+
+=cut
+
+sub _default_header {
+ return << '.';
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+.
+}
+
+sub _to_gettext {
+ my ($text, $verbatim) = @_;
+ return '' unless defined $text;
+
+ $text =~ s/\\/\\\\/g;
+ $text =~ s/\"/\\"/g;
+
+ while (my ($char, $esc) = each %Escapes) {
+ $text =~ s/$esc/$char/g;
+ }
+ return $text if $verbatim;
+
+ $text =~ s/((?<!~)(?:~~)*)\[_(\d+)\]/$1%$2/g;
+ $text =~ s/((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]/$1%$2("""$3""")/g;
+ $text = join('', map {
+ /^""".*"""$/ ? _escape(substr($_, 3, -3)) : $_
+ } split(/(""".*?""")/, $text));
+
+ $text =~ s/~([\~\[\]])/$1/g;
+ return $text;
+}
+
+sub _escape {
+ my $text = shift;
+ $text =~ s/\b_(\d+)/%$1/g;
+ return $text;
+}
+
+sub _format {
+ my $str = shift;
+ $str =~ s/\\/\\\\/g;
+ $str =~ s/"/\\"/g;
+ return "\"$str\"\n" unless $str =~ /\n/;
+ my $multi_line = ($str =~ /\n(?!\z)/);
+ $str =~ s/\n/\\n"\n"/g;
+ if ($str =~ /\n"$/) {
+ chop $str;
+ }
+ else {
+ $str .= "\"\n";
+ }
+ return $multi_line ? qq(""\n"$str) : qq("$str);
+}
+
+1;
+
+=head1 ACKNOWLEDGMENTS
+
+Thanks to Jesse Vincent for contributing to an early version of this
+module.
+
+Also to Alain Barbet, who effectively re-wrote the source parser with a
+flex-like algorithm.
+
+=head1 SEE ALSO
+
+L<xgettext.pl>, L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
+
+=head1 AUTHORS
+
+Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
+
+=head1 COPYRIGHT
+
+Copyright 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
+
+This program is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+See L<http://www.perl.com/perl/misc/Artistic.html>
+
+=cut