<= 0 : Preformat entire document
1 : one line triggers
>= 2 : two lines trigger
(default: 2)
=item endpreformat_trigger_lines
endpreformat_trigger_lines=>I
How many lines of unpreformatted-looking text are needed to switch from
<= 0 : Never preformat within document
1 : one line triggers
>= 2 : two lines trigger
(default: 2)
NOTE for preformat_trigger_lines and endpreformat_trigger_lines:
A zero takes precedence. If one is zero, the other is ignored.
If both are zero, entire document is preformatted.
=item preformat_start_marker
preformat_start_marker=>I
What flags the start of a preformatted section if --use_preformat_marker
is true.
(default: "^(:?(:?<)|<)PRE(:?(:?>)|>)\$")
=item preformat_end_marker
preformat_end_marker=>I
What flags the end of a preformatted section if --use_preformat_marker
is true.
(default: "^(:?(:?<)|<)/PRE(:?(:?>)|>)\$")
=item preformat_whitespace_min
preformat_whitespace_min=>I
Minimum number of consecutive whitespace characters to trigger
normal preformatting.
NOTE: Tabs are expanded to spaces before this check is made.
That means if B is 8 and this is 5, then one tab may be
expanded to 8 spaces, which is enough to trigger preformatting.
(default: 5)
=item prepend_file
prepend_file=>I
If you want something prepended to the processed body text, put the
filename here. The prepended text will not be processed at all, so make
sure it's plain text or decent HTML.
(default: nothing)
=item preserve_indent
preserve_indent=>1
Preserve the first-line indentation of paragraphs marked with indents
by replacing the spaces of the first line with non-breaking spaces.
(default: false)
=item short_line_length
short_line_length=>I
Lines this short (or shorter) must be intentionally broken and are kept
that short.
(default: 40)
=item style_url
style_url=>I
This gives the URL of a stylesheet; a LINK tag will be added to the
output.
=item tab_width
tab_width=>I
How many spaces equal a tab?
(default: 8)
=item table_type
table_type=>{ ALIGN=>0, PGSQL=>0, BORDER=>1, DELIM=>0 }
This determines which types of tables will be recognised when "make_tables"
is true. The possible types are ALIGN, PGSQL, BORDER and DELIM.
(default: all types are true)
=item title
title=>I
You can specify a title. Otherwise it will use a blank one.
(default: nothing)
=item titlefirst
titlefirst=>1
Use the first non-blank line as the title.
=item underline_length_tolerance
underline_length_tolerance=>I
How much longer or shorter can underlines be and still be underlines?
(default: 1)
=item underline_offset_tolerance
underline_offset_tolerance=>I
How far offset can underlines be and still be underlines?
(default: 1)
=item unhyphenation
unhyphenation=>0
Enables unhyphenation of text.
(default: true)
=item use_mosaic_header
use_mosaic_header=>1
Use this option if you want to force the heading styles to match what Mosaic
outputs. (Underlined with "***"s is H1,
with "==="s is H2, with "+++" is H3, with "---" is H4, with "~~~" is H5
and with "..." is H6)
This was the behavior of txt2html up to version 1.10.
(default: false)
=item use_preformat_marker
use_preformat_marker=>1
Turn on preformatting when encountering "" on a line by itself, and turn
it off when there's a line containing only "
".
When such preformatted text is detected, the PRE tag will be given the
class 'quote_explicit'.
(default: off)
=item xhtml
xhtml=>1
Try to make the output conform to the XHTML standard, including
closing all open tags and marking empty tags correctly. This
turns on --lower_case_tags and overrides the --doctype option.
Note that if you add a header or a footer file, it is up to you
to make it conform; the header/footer isn't touched by this.
Likewise, if you make link-dictionary entries that break XHTML,
then this won't fix them, except to the degree of putting all tags
into lower-case.
=back
=head1 METHODS
=cut
#------------------------------------------------------------------------
require Exporter;
use Data::Dumper;
our $PROG = 'HTML::TextToHTML';
#------------------------------------------------------------------------
use constant TEXT_TO_HTML => "TEXT_TO_HTML";
########################################
# Definitions (Don't change these)
#
# These are just constants I use for making bit vectors to keep track
# of what modes I'm in and what actions I've taken on the current and
# previous lines.
our $NONE = 0;
our $LIST = 1;
our $HRULE = 2;
our $PAR = 4;
our $PRE = 8;
our $END = 16;
our $BREAK = 32;
our $HEADER = 64;
our $MAILHEADER = 128;
our $MAILQUOTE = 256;
our $CAPS = 512;
our $LINK = 1024;
our $PRE_EXPLICIT = 2048;
our $TABLE = 4096;
our $IND_BREAK = 8192;
our $LIST_START = 16384;
our $LIST_ITEM = 32768;
# Constants for Link-processing
# bit-vectors for what to do with a particular link-dictionary entry
our $LINK_NOCASE = 1;
our $LINK_EVAL = 2;
our $LINK_HTML = 4;
our $LINK_ONCE = 8;
our $LINK_SECT_ONCE = 16;
# Constants for Ordered Lists and Unordered Lists.
# And Definition Lists.
# I use this in the list stack to keep track of what's what.
our $OL = 1;
our $UL = 2;
our $DL = 3;
# Constants for table types
our $TAB_ALIGN = 1;
our $TAB_PGSQL = 2;
our $TAB_BORDER = 3;
our $TAB_DELIM = 4;
# Character entity names
# characters to replace with entities
our %char_entities = (
"\241", "¡", "\242", "¢", "\243", "£",
"\244", "¤", "\245", "¥", "\246", "¦",
"\247", "§", "\250", "¨", "\251", "©",
"\252", "ª", "\253", "«", "\254", "¬",
"\255", "", "\256", "®", "\257", "&hibar;",
"\260", "°", "\261", "±", "\262", "²",
"\263", "³", "\264", "´", "\265", "µ",
"\266", "¶", "\270", "¸", "\271", "¹",
"\272", "º", "\273", "»", "\274", "¼",
"\275", "½", "\276", "¾", "\277", "¿",
"\300", "À", "\301", "Á", "\302", "Â",
"\303", "Ã", "\304", "Ä", "\305", "Å",
"\306", "Æ", "\307", "Ç", "\310", "È",
"\311", "É", "\312", "Ê", "\313", "Ë",
"\314", "Ì", "\315", "Í", "\316", "Î",
"\317", "Ï", "\320", "Ð", "\321", "Ñ",
"\322", "Ò", "\323", "Ó", "\324", "Ô",
"\325", "Õ", "\326", "Ö", "\327", "×",
"\330", "Ø", "\331", "Ù", "\332", "Ú",
"\333", "Û", "\334", "Ü", "\335", "Ý",
"\336", "Þ", "\337", "ß", "\340", "à",
"\341", "á", "\342", "â", "\343", "ã",
"\344", "ä", "\345", "å", "\346", "æ",
"\347", "ç", "\350", "è", "\351", "é",
"\352", "ê", "\353", "ë", "\354", "ì",
"\355", "í", "\356", "î", "\357", "ï",
"\360", "ð", "\361", "ñ", "\362", "ò",
"\363", "ó", "\364", "ô", "\365", "õ",
"\366", "ö", "\367", "÷", "\370", "ø",
"\371", "ù", "\372", "ú", "\373", "û",
"\374", "ü", "\375", "ý", "\376", "þ",
"\377", "ÿ", "\267", "·",
);
# alignments for tables
our @alignments = ('', '', ' ALIGN="RIGHT"', ' ALIGN="CENTER"');
our @lc_alignments = ('', '', ' align="right"', ' align="center"');
our @xhtml_alignments =
('', '', ' style="text-align: right;"', ' style="text-align: center;"');
#---------------------------------------------------------------#
# Object interface
#---------------------------------------------------------------#
=head2 new
$conv = new HTML::TextToHTML()
$conv = new HTML::TextToHTML(titlefirst=>1,
...
);
Create a new object with new. If one argument is given, it is assumed
to be a reference to an array of arguments. If more than one argument
is given, it is assumed to be a hash of arguments. These arguments will
be used in invocations of other methods.
See L for the possible values of the arguments.
=cut
sub new
{
my $invocant = shift;
my $self = {};
my $class = ref($invocant) || $invocant; # Object or class name
init_our_data($self);
# bless self
bless($self, $class);
$self->args(@_);
return $self;
} # new
=head2 args
$conv->args(short_line_length=>60,
titlefirst=>1,
....
);
Updates the current arguments/options of the HTML::TextToHTML object.
Takes either a hash, or a reference to an array of arguments, which will
be used in invocations of other methods.
See L for the possible values of the arguments.
NOTE: the reference-to-an-array usage is DEPRECATED and will be removed
in the future.
=cut
sub args
{
my $self = shift;
my %args = ();
my @arg_array = ();
if ( @_
&& @_ == 1
&& ref $_[0] eq 'ARRAY')
{
# this is a reference to an array -- use the old style args
my $aref = shift;
@arg_array = @{$aref};
}
elsif (@_)
{
%args = @_;
}
if (%args)
{
if ($self->{debug})
{
print STDERR "========args(hash)========\n";
print STDERR Dumper(%args);
}
foreach my $arg (keys %args)
{
if (defined $args{$arg})
{
if ($arg =~ /^-/)
{
$arg =~ s/^-//; # get rid of first dash
$arg =~ s/^-//; # get rid of possible second dash
}
if ($self->{debug})
{
print STDERR "--", $arg;
}
$self->{$arg} = $args{$arg};
if ($self->{debug})
{
print STDERR " ", $args{$arg}, "\n";
}
}
}
}
elsif (@arg_array)
{
if ($self->{debug})
{
print STDERR "========args(array)========\n";
print STDERR Dumper(@arg_array);
}
# the arg array may have filenames at the end of it,
# so don't consume them
my $look_at_args = 1;
while (@arg_array && $look_at_args)
{
my $arg = shift @arg_array;
# check for arguments which are bools,
# and thus have no companion value
if ($arg =~ /^-/)
{
$arg =~ s/^-//; # get rid of first dash
$arg =~ s/^-//; # get rid of possible second dash
if ($self->{debug})
{
print STDERR "--", $arg;
}
if ( $arg eq 'debug'
|| $arg eq 'demoronize'
|| $arg eq 'eight_bit_clean'
|| $arg eq 'escape_HTML_chars'
|| $arg eq 'explicit_headings'
|| $arg eq 'extract'
|| $arg eq 'link_only'
|| $arg eq 'lower_case_tags'
|| $arg eq 'mailmode'
|| $arg eq 'make_anchors'
|| $arg eq 'make_links'
|| $arg eq 'make_tables'
|| $arg eq 'preserve_indent'
|| $arg eq 'titlefirst'
|| $arg eq 'unhyphenation'
|| $arg eq 'use_mosaic_header'
|| $arg eq 'use_preformat_marker'
|| $arg eq 'verbose'
|| $arg eq 'xhtml')
{
$self->{$arg} = 1;
if ($self->{debug})
{
print STDERR "=true\n";
}
}
elsif ($arg eq 'nodebug'
|| $arg eq 'nodemoronize'
|| $arg eq 'noeight_bit_clean'
|| $arg eq 'noescape_HTML_chars'
|| $arg eq 'noexplicit_headings'
|| $arg eq 'noextract'
|| $arg eq 'nolink_only'
|| $arg eq 'nolower_case_tags'
|| $arg eq 'nomailmode'
|| $arg eq 'nomake_anchors'
|| $arg eq 'nomake_links'
|| $arg eq 'nomake_tables'
|| $arg eq 'nopreserve_indent'
|| $arg eq 'notitlefirst'
|| $arg eq 'nounhyphenation'
|| $arg eq 'nouse_mosaic_header'
|| $arg eq 'nouse_preformat_marker'
|| $arg eq 'noverbose'
|| $arg eq 'noxhtml')
{
$arg =~ s/^no//;
$self->{$arg} = 0;
if ($self->{debug})
{
print STDERR " $arg=false\n";
}
}
else
{
my $val = shift @arg_array;
if ($self->{debug})
{
print STDERR "=", $val, "\n";
}
# check the types
if (defined $arg && defined $val)
{
if ( $arg eq 'infile'
|| $arg eq 'custom_heading_regexp'
|| $arg eq 'links_dictionaries')
{ # arrays
if ($val eq 'CLEAR')
{
$self->{$arg} = [];
}
else
{
push @{$self->{$arg}}, $val;
}
}
elsif ($arg eq 'file')
{ # alternate for 'infile'
if ($val eq 'CLEAR')
{
$self->{infile} = [];
}
else
{
push @{$self->{infile}}, $val;
}
}
elsif ($arg eq 'table_type')
{
# hash
if ($val eq 'CLEAR')
{
$self->{$arg} = {};
}
else
{
my ($f1, $v1) = split(/=/, $val, 2);
$self->{$arg}->{$f1} = $v1;
}
}
else
{
$self->{$arg} = $val;
}
}
}
}
else
{
# if an option don't start with - then we've
# come to the end of the options
$look_at_args = 0;
}
}
}
if ($self->{debug})
{
print STDERR Dumper($self);
}
return 1;
} # args
=head2 process_chunk
$newstring = $conv->process_chunk($mystring);
Convert a string to a HTML fragment. This assumes that this string is
at the least, a single paragraph, but it can contain more than that.
This returns the processed string. If you want to pass arguments to
alter the behaviour of this conversion, you need to do that earlier,
either when you create the object, or with the L method.
$newstring = $conv->process_chunk($mystring,
close_tags=>0);
If there are open tags (such as lists) in the input string,
process_chunk will now automatically close them, unless you specify not
to, with the close_tags option.
$newstring = $conv->process_chunk($mystring,
is_fragment=>1);
If you want this string to be treated as a fragment, and not assumed to
be a paragraph, set is_fragment to true. If there is more than one
paragraph in the string (ie it contains blank lines) then this option
will be ignored.
=cut
sub process_chunk ($$;%)
{
my $self = shift;
my $chunk = shift;
my %args = (
close_tags => 1,
is_fragment => 0,
@_
);
my $ret_str = '';
my @paras = split(/\r?\n\r?\n/, $chunk);
my $ind = 0;
if (@paras == 1) # just one paragraph
{
$ret_str .= $self->process_para(
$chunk,
close_tags => $args{close_tags},
is_fragment => $args{is_fragment}
);
}
else
{
my $ind = 0;
foreach my $para (@paras)
{
# if the paragraph doesn't end with a newline, add one
$para .= "\n" if ($para !~ /\n$/);
if ($ind == @paras - 1) # last one
{
$ret_str .= $self->process_para(
$para,
close_tags => $args{close_tags},
is_fragment => 0
);
}
else
{
$ret_str .= $self->process_para(
$para,
close_tags => 0,
is_fragment => 0
);
}
$ind++;
}
}
$ret_str;
} # process_chunk
=head2 process_para
$newstring = $conv->process_para($mystring);
Convert a string to a HTML fragment. This assumes that this string is
at the most a single paragraph, with no blank lines in it. If you don't
know whether your string will contain blank lines or not, use the
L method instead.
This returns the processed string. If you want to pass arguments to
alter the behaviour of this conversion, you need to do that earlier,
either when you create the object, or with the L method.
$newstring = $conv->process_para($mystring,
close_tags=>0);
If there are open tags (such as lists) in the input string, process_para
will now automatically close them, unless you specify not to, with the
close_tags option.
$newstring = $conv->process_para($mystring,
is_fragment=>1);
If you want this string to be treated as a fragment, and not assumed to be
a paragraph, set is_fragment to true.
=cut
sub process_para ($$;%)
{
my $self = shift;
my $para = shift;
my %args = (
close_tags => 1,
is_fragment => 0,
@_
);
# if this is an external call, do certain initializations
$self->do_init_call();
my $para_action = $NONE;
# tables and mailheaders don't carry over from one para to the next
if ($self->{__mode} & $TABLE)
{
$self->{__mode} ^= $TABLE;
}
if ($self->{__mode} & $MAILHEADER)
{
$self->{__mode} ^= $MAILHEADER;
}
# convert Microsoft character codes into sensible characters
if ($self->{demoronize})
{
demoronize_char($para);
}
# if we are not just linking, we are discerning structure
if (!$self->{link_only})
{
# Chop trailing whitespace and DOS CRs
$para =~ s/[ \011]*\015$//;
# Chop leading whitespace and DOS CRs
$para =~ s/^[ \011]*\015//;
$para =~ s/\r//g; # remove any stray carriage returns
my @done_lines = (); # lines which have been processed
# The PRE_EXPLICIT structure can carry over from one
# paragraph to the next, but it is ended with the
# explicit end-tag designated for it.
# Therefore we can shortcut for this by checking
# for the end of the PRE_EXPLICIT and chomping off
# the preformatted string part of this para before
# we have to split it into lines.
# Note that after this check, we could *still* be
# in PRE_EXPLICIT mode.
if ($self->{__mode} & $PRE_EXPLICIT)
{
my $pre_str =
$self->split_end_explicit_preformat(para_ref => \$para);
if ($pre_str)
{
push @done_lines, $pre_str;
}
}
if (defined $para && $para ne "")
{
#
# Now we split the paragraph into lines
#
my $para_len = length($para);
my @para_lines = split(/^/, $para);
my @para_line_len = ();
my @para_line_indent = ();
my @para_line_action = ();
my $i = 0;
foreach my $line (@para_lines)
{
# Change all tabs to spaces
while ($line =~ /\011/)
{
my $tw = $self->{tab_width};
$line =~ s/\011/" " x ($tw - (length($`) % $tw))/e;
}
push @para_line_len, length($line);
if ($line =~ /^\s*$/)
{
# if the line is blank, use the previous indent
# if there is one
push @para_line_indent,
($i == 0 ? 0 : $para_line_indent[$i - 1]);
}
else
{
# count the number of leading spaces
my ($ws) = $line =~ /^( *)[^ ]/;
push @para_line_indent, length($ws);
}
push @para_line_action, $NONE;
$i++;
}
# There are two more structures which carry over from one
# paragraph to the next: LIST, PRE
# There are also certain things which will immediately end
# multi-paragraph LIST and PRE, if found at the start
# of a paragraph:
# A list will be ended by
# TABLE, MAILHEADER, HEADER, custom-header
# A PRE will be ended by
# TABLE, MAILHEADER and non-pre text
my $is_table = 0;
my $table_type = 0;
my $is_mailheader = 0;
my $is_header = 0;
my $is_custom_header = 0;
if (@{$self->{custom_heading_regexp}})
{
$is_custom_header =
$self->is_custom_heading(line => $para_lines[0]);
}
if ( $self->{make_tables}
&& @para_lines > 1)
{
$table_type = $self->get_table_type(
rows_ref => \@para_lines,
para_len => $para_len
);
$is_table = ($table_type != 0);
}
if ( !$self->{explicit_headings}
&& @para_lines > 1
&& !$is_table)
{
$is_header = $self->is_heading(
line_ref => \$para_lines[0],
next_ref => \$para_lines[1]
);
}
# Note that it is concievable that someone has
# partially disabled mailmode by making a custom header
# which matches the start of mail.
# This is stupid, but allowable, so we check.
if ( $self->{mailmode}
&& !$is_table
&& !$is_custom_header)
{
$is_mailheader = $self->is_mailheader(rows_ref => \@para_lines);
}
# end the list if we can end it
if (
($self->{__mode} & $LIST)
&& ( $is_table
|| $is_mailheader
|| $is_header
|| $is_custom_header)
)
{
my $list_end = '';
my $action = 0;
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$list_end,
line_action_ref => \$action
);
push @done_lines, $list_end;
$self->{__prev_para_action} |= $END;
}
# end the PRE if we can end it
if (
($self->{__mode} & $PRE)
&& !($self->{__mode} & $PRE_EXPLICIT)
&& ( $is_table
|| $is_mailheader
|| !$self->is_preformatted($para_lines[0]))
&& ($self->{preformat_trigger_lines} != 0)
)
{
my $pre_end = '';
my $tag = $self->close_tag('PRE');
$pre_end = "${tag}\n";
$self->{__mode} ^= ($PRE & $self->{__mode});
push @done_lines, $pre_end;
$self->{__prev_para_action} |= $END;
}
# The PRE and PRE_EXPLICIT structure can carry over
# from one paragraph to the next, but because we don't
# want trailing newlines, such newlines would have been
# gotten rid of in the previous call. However, with
# a preformatted text, we do want the blank lines in it
# to be preserved, so let's add a blank line in here.
if ($self->{__mode} & $PRE)
{
push @done_lines, "\n";
}
# Now, we do certain things which are only found at the
# start of a paragraph:
# HEADER, custom-header, TABLE and MAILHEADER
# These could concievably eat the rest of the paragraph.
if ($is_custom_header)
{
# custom header eats the first line
my $header = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
$self->custom_heading(line_ref => \$header);
push @done_lines, $header;
$self->{__prev_para_action} |= $HEADER;
}
elsif ($is_header)
{
# normal header eats the first two lines
my $header = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
my $underline = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
$self->heading(
line_ref => \$header,
next_ref => \$underline
);
push @done_lines, $header;
$self->{__prev_para_action} |= $HEADER;
}
# do the table stuff on the array of lines
if ($self->{make_tables} && $is_table)
{
if (
$self->tablestuff(
table_type => $table_type,
rows_ref => \@para_lines,
para_len => $para_len
)
)
{
# this has used up all the lines
push @done_lines, @para_lines;
@para_lines = ();
}
}
# check of this para is a mail-header
if ( $is_mailheader
&& !($self->{__mode} & $TABLE)
&& @para_lines)
{
$self->mailheader(rows_ref => \@para_lines);
# this has used up all the lines
push @done_lines, @para_lines;
@para_lines = ();
}
#
# Now go through the paragraph lines one at a time
# Note that we won't have TABLE, MAILHEADER, HEADER modes
# because they would have eaten the lines
#
my $prev = '';
my $prev_action = $self->{__prev_para_action};
for (my $i = 0; $i < @para_lines; $i++)
{
my $prev_ref;
my $prev_action_ref;
my $prev_line_indent;
my $prev_line_len;
if ($i == 0)
{
$prev_ref = \$prev;
$prev_action_ref = \$prev_action;
$prev_line_indent = 0;
$prev_line_len = 0;
}
else
{
$prev_ref = \$para_lines[$i - 1];
$prev_action_ref = \$para_line_action[$i - 1];
$prev_line_indent = $para_line_indent[$i - 1];
$prev_line_len = $para_line_len[$i - 1];
}
my $next_ref;
if ($i == $#para_lines)
{
$next_ref = undef;
}
else
{
$next_ref = \$para_lines[$i + 1];
}
$para_lines[$i] = escape($para_lines[$i])
if ($self->{escape_HTML_chars});
if ($self->{mailmode}
&& !($self->{__mode} & ($PRE_EXPLICIT)))
{
$self->mailquote(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
next_ref => $next_ref
);
}
if ( ($self->{__mode} & $PRE)
&& ($self->{preformat_trigger_lines} != 0))
{
$self->endpreformat(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
ind => $i,
prev_ref => $prev_ref
);
}
if (!($self->{__mode} & $PRE))
{
$self->hrule(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
ind => $i
);
}
if (!($self->{__mode} & ($PRE))
&& ($para_lines[$i] !~ /^\s*$/))
{
$self->liststuff(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
para_line_indent_ref => \@para_line_indent,
ind => $i,
prev_ref => $prev_ref
);
}
if ( !($para_line_action[$i] & ($HEADER | $LIST))
&& !($self->{__mode} & ($LIST | $PRE))
&& $self->{__preformat_enabled})
{
$self->preformat(
mode_ref => \$self->{__mode},
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
next_ref => $next_ref,
prev_action_ref => $prev_action_ref
);
}
if (!($self->{__mode} & ($PRE)))
{
$self->paragraph(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
line_indent => $para_line_indent[$i],
prev_indent => $prev_line_indent,
is_fragment => $args{is_fragment},
ind => $i,
);
}
if (!($self->{__mode} & ($PRE | $LIST)))
{
$self->shortline(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
prev_line_len => $prev_line_len
);
}
if (!($self->{__mode} & ($PRE)))
{
$self->caps(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i]
);
}
# put the "prev" line in front of the first line
$para_lines[$i] = $prev . $para_lines[$i]
if ($i == 0 && ($prev !~ /^\s*$/));
}
# para action is the action of the last line of the para
$para_action = $para_line_action[$#para_line_action];
$para_action = $NONE if (!defined $para_action);
# push them on the done lines
push @done_lines, @para_lines;
@para_lines = ();
}
# now put the para back together as one string
$para = join('', @done_lines);
# if this is a paragraph, and we are in XHTML mode,
# close an open paragraph.
if ($self->{xhtml})
{
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
if (defined $open_tag && $open_tag eq 'P')
{
$para .= $self->close_tag('P');
}
}
if (
$self->{unhyphenation}
# ends in hyphen & next line starts w/letters
&& ($para =~ /[^\W\d_]\-\n\s*[^\W\d_]/s) && !(
$self->{__mode} &
($PRE | $HEADER | $MAILHEADER | $TABLE | $BREAK)
)
)
{
$self->unhyphenate_para(\$para);
}
# chop trailing newlines for continuing lists and PRE
if ( $self->{__mode} & $LIST
|| $self->{__mode} & $PRE)
{
$para =~ s/\n$//g;
}
}
# apply links and bold/italic formatting
if ($para !~ /^\s*$/)
{
$self->apply_links(
para_ref => \$para,
para_action_ref => \$para_action
);
}
# close any open lists if required to
if ( $args{close_tags}
&& $self->{__mode} & $LIST) # End all lists
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$para,
line_action_ref => \$para_action
);
}
# close any open tags
if ($args{close_tags} && $self->{xhtml})
{
while (@{$self->{__tags}})
{
$para .= $self->close_tag('');
}
}
# convert remaining Microsoft character codes into sensible HTML
if ($self->{demoronize})
{
$para = demoronize_code($para);
}
# All the matching and formatting is done. Now we can
# replace non-ASCII characters with character entities.
if (!$self->{eight_bit_clean})
{
my @chars = split(//, $para);
foreach $_ (@chars)
{
$_ = $char_entities{$_} if defined($char_entities{$_});
}
$para = join('', @chars);
}
$self->{__prev_para_action} = $para_action;
return $para;
} # process_para
=head2 txt2html
$conv->txt2html(%args);
Convert a text file to HTML. Takes a hash of arguments, or a reference
to an array of arguments to customize the conversion; (this includes
saying what file to convert!) See L for the possible values of
the arguments. Arguments which have already been set with B or
B will remain as they are, unless they are overridden.
=cut
sub txt2html ($;$)
{
my $self = shift;
if (@_)
{
$self->args(@_);
}
$self->do_init_call();
my $outhandle;
my $outhandle_needs_closing;
# set up the output
if ($self->{outhandle})
{
$outhandle = $self->{outhandle};
$outhandle_needs_closing = 1;
}
elsif ($self->{outfile} eq "-")
{
$outhandle = *STDOUT;
$outhandle_needs_closing = 0;
}
else
{
open($outhandle, "> " . $self->{outfile})
|| die "Error: unable to open ", $self->{outfile}, ": $!\n";
$outhandle_needs_closing = 1;
}
# slurp up a paragraph at a time, a file at a time
local $/ = "";
my $para = '';
my $count = 0;
my $print_count = 0;
my @sources = ();
my $source_type;
if ($self->{infile} and @{$self->{infile}})
{
@sources = @{$self->{infile}};
$source_type = 'file';
}
elsif ($self->{inhandle} and @{$self->{inhandle}})
{
@sources = @{$self->{inhandle}};
$source_type = 'filehandle';
}
elsif ($self->{instring} and @{$self->{instring}})
{
@sources = @{$self->{instring}};
$source_type = 'string';
}
my $inhandle;
my $inhandle_needs_closing = 0;
foreach my $source (@sources)
{
$inhandle = undef;
if ($source_type eq 'file')
{
if (!$source or $source eq '-')
{
$inhandle = *STDIN;
$inhandle_needs_closing = 0;
}
else
{
if (-f $source && open($inhandle, $source))
{
$inhandle_needs_closing = 1;
}
else # error
{
warn "Could not open $source\n";
next;
}
}
}
elsif ($source_type eq 'filehandle')
{
$inhandle = $source;
$inhandle_needs_closing = 1;
}
if ($source_type eq 'string')
{
# process the string
$para = $_;
$para =~ s/\n$//; # trim the endline
if ($count == 0)
{
$self->do_file_start($outhandle, $para);
}
$self->{__done_with_sect_link} = [];
$para = $self->process_chunk($para, close_tags => 0);
print $outhandle $para, "\n";
$print_count++;
$count++;
}
else # file or filehandle
{
while (<$inhandle>)
{
$para = $_;
$para =~ s/\n$//; # trim the endline
if ($count == 0)
{
$self->do_file_start($outhandle, $para);
}
$self->{__done_with_sect_link} = [];
$para = $self->process_chunk($para, close_tags => 0);
print $outhandle $para, "\n";
$print_count++;
$count++;
}
if ($inhandle_needs_closing)
{
close($inhandle);
}
}
} # for each file
$self->{__prev} = "";
if ($self->{__mode} & $LIST) # End all lists
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$self->{__prev},
line_action_ref => \$self->{__line_action}
);
}
print $outhandle $self->{__prev};
# end open preformats
if ($self->{__mode} & $PRE)
{
my $tag = $self->close_tag('PRE');
print $outhandle $tag;
}
# close all open tags
if ( $self->{xhtml}
&& !$self->{extract}
&& @{$self->{__tags}})
{
if ($self->{dict_debug} & 8)
{
print STDERR "closing all tags at end\n";
}
# close any open tags (until we get to the body)
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
while (@{$self->{__tags}}
&& $open_tag ne 'BODY'
&& $open_tag ne 'HTML')
{
print $outhandle $self->close_tag('');
$open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
}
print $outhandle "\n";
}
if ($self->{append_file})
{
if (-r $self->{append_file})
{
open(APPEND, $self->{append_file});
while ()
{
print $outhandle $_;
$print_count++;
}
close(APPEND);
}
else
{
print STDERR "Can't find or read file ", $self->{append_file},
" to append.\n";
}
}
# print the closing tags (if we have printed stuff at all)
if ($print_count && !$self->{extract})
{
print $outhandle $self->close_tag('BODY'), "\n";
print $outhandle $self->close_tag('HTML'), "\n";
}
if ($outhandle_needs_closing)
{
close($outhandle);
}
return 1;
}
#---------------------------------------------------------------#
# Init-related subroutines
#--------------------------------#
# Name: init_our_data
# Args:
# $self
sub init_our_data ($)
{
my $self = shift;
$self->{debug} = 0;
#
# All the options, in alphabetical order
#
$self->{append_file} = '';
$self->{append_head} = '';
$self->{body_deco} = '';
$self->{bullets} = '-=o*\267';
$self->{bullets_ordered} = '';
$self->{bold_delimiter} = '#';
$self->{caps_tag} = 'STRONG';
$self->{custom_heading_regexp} = [];
$self->{default_link_dict} =
($ENV{HOME} ? "$ENV{HOME}/.txt2html.dict" : '.txt2html.dict');
$self->{dict_debug} = 0;
$self->{doctype} = "-//W3C//DTD HTML 3.2 Final//EN";
$self->{demoronize} = 1;
$self->{eight_bit_clean} = 0;
$self->{escape_HTML_chars} = 1;
$self->{explicit_headings} = 0;
$self->{extract} = 0;
$self->{hrule_min} = 4;
$self->{indent_width} = 2;
$self->{indent_par_break} = 0;
$self->{infile} = [];
$self->{inhandle} = [];
$self->{instring} = [];
$self->{italic_delimiter} = '*';
$self->{links_dictionaries} = [];
$self->{link_only} = 0;
$self->{lower_case_tags} = 0;
$self->{mailmode} = 0;
$self->{make_anchors} = 1;
$self->{make_links} = 1;
$self->{make_tables} = 0;
$self->{min_caps_length} = 3;
$self->{outfile} = '-';
$self->{par_indent} = 2;
$self->{preformat_trigger_lines} = 2;
$self->{endpreformat_trigger_lines} = 2;
$self->{preformat_start_marker} = "^(:?(:?<)|<)PRE(:?(:?>)|>)\$";
$self->{preformat_end_marker} = "^(:?(:?<)|<)/PRE(:?(:?>)|>)\$";
$self->{preformat_whitespace_min} = 5;
$self->{prepend_file} = '';
$self->{preserve_indent} = 0;
$self->{short_line_length} = 40;
$self->{style_url} = '';
$self->{tab_width} = 8;
$self->{table_type} = {
ALIGN => 1,
PGSQL => 1,
BORDER => 1,
DELIM => 1,
};
$self->{title} = '';
$self->{titlefirst} = 0;
$self->{underline_length_tolerance} = 1;
$self->{underline_offset_tolerance} = 1;
$self->{unhyphenation} = 1;
$self->{use_mosaic_header} = 0;
$self->{use_preformat_marker} = 0;
$self->{xhtml} = 0;
# accumulation variables
$self->{__file} = ""; # Current file being processed
$self->{__heading_styles} = {};
$self->{__num_heading_styles} = 0;
$self->{__links_table} = {};
$self->{__links_table_order} = [];
$self->{__search_patterns} = [];
$self->{__repl_code} = [];
$self->{__prev_para_action} = 0;
$self->{__non_header_anchor} = 0;
$self->{__mode} = 0;
$self->{__listnum} = 0;
$self->{__list_nice_indent} = "";
$self->{__list_indent} = [];
$self->{__call_init_done} = 0;
#
# The global links data
#
# This is stored in the DATA handle, after the __DATA__ at
# the end of this file; but because the test scripts (and possibly
# other scripts) don't just create one instance of this object,
# we have to remember the position of the DATA handle
# and reset it after we've read from it, just in case
# we have to read from it again.
# This also means that we don't close it, either. Hope that doesn't
# cause a problem...
#
my $curpos = tell(DATA); # remember the __DATA__ position
my @lines = ();
while ()
{
# skip lines that start with '#'
next if /^\#/;
# skip lines that end with unescaped ':'
next if /^.*[^\\]:\s*$/;
push @lines, $_;
}
# reset the data handle to the start, just in case
seek(DATA, $curpos, 0);
$self->{__global_links_data} = join('', @lines);
} # init_our_data
#---------------------------------------------------------------#
# txt2html-related subroutines
#--------------------------------#
# Name: deal_with_options
# do extra processing related to particular options
# Args:
# $self
sub deal_with_options ($)
{
my $self = shift;
if ($self->{links_dictionaries})
{
# only put into the links dictionaries files which are readable
my @dict_files = @{$self->{links_dictionaries}};
$self->args(links_dictionaries => []);
foreach my $ld (@dict_files)
{
if (-r $ld)
{
$self->{'make_links'} = 1;
$self->args(['--links_dictionaries', $ld]);
}
else
{
print STDERR "Can't find or read link-file $ld\n";
}
}
}
if (!$self->{make_links})
{
$self->{'links_dictionaries'} = 0;
}
if ($self->{append_file})
{
if (!-r $self->{append_file})
{
print STDERR "Can't find or read ", $self->{append_file}, "\n";
$self->{append_file} = '';
}
}
if ($self->{prepend_file})
{
if (!-r $self->{prepend_file})
{
print STDERR "Can't find or read ", $self->{prepend_file}, "\n";
$self->{'prepend_file'} = '';
}
}
if ($self->{append_head})
{
if (!-r $self->{append_head})
{
print STDERR "Can't find or read ", $self->{append_head}, "\n";
$self->{'append_head'} = '';
}
}
if (!$self->{outfile})
{
$self->{'outfile'} = "-";
}
$self->{'preformat_trigger_lines'} = 0
if ($self->{preformat_trigger_lines} < 0);
$self->{'preformat_trigger_lines'} = 2
if ($self->{preformat_trigger_lines} > 2);
$self->{'endpreformat_trigger_lines'} = 1
if ($self->{preformat_trigger_lines} == 0);
$self->{'endpreformat_trigger_lines'} = 0
if ($self->{endpreformat_trigger_lines} < 0);
$self->{'endpreformat_trigger_lines'} = 2
if ($self->{endpreformat_trigger_lines} > 2);
$self->{__preformat_enabled} =
(($self->{endpreformat_trigger_lines} != 0)
|| $self->{use_preformat_marker});
if ($self->{use_mosaic_header})
{
my $num_heading_styles = 0;
my %heading_styles = ();
$heading_styles{"*"} = ++$num_heading_styles;
$heading_styles{"="} = ++$num_heading_styles;
$heading_styles{"+"} = ++$num_heading_styles;
$heading_styles{"-"} = ++$num_heading_styles;
$heading_styles{"~"} = ++$num_heading_styles;
$heading_styles{"."} = ++$num_heading_styles;
$self->{__heading_styles} = \%heading_styles;
$self->{__num_heading_styles} = $num_heading_styles;
}
# XHTML implies lower case
$self->{'lower_case_tags'} = 1 if ($self->{xhtml});
}
sub escape ($)
{
my ($text) = @_;
$text =~ s/&/&/g;
$text =~ s/>/>/g;
$text =~ s/</g;
return $text;
}
# Added by Alan Jackson, alan at ajackson dot org, and based
# on the demoronize script by John Walker, http://www.fourmilab.ch/
# Convert Microsoft character entities into characters.
sub demoronize_char($)
{
my $s = shift;
# Map strategically incompatible non-ISO characters in the
# range 0x82 -- 0x9F into plausible substitutes where
# possible.
$s =~ s/\x82/,/g;
$s =~ s/\x84/,,/g;
$s =~ s/\x85/.../g;
$s =~ s/\x88/^/g;
$s =~ s/\x8B//g;
$s =~ s/\x9C/oe/g;
return $s;
}
# convert Microsoft character entities into HTML code
sub demoronize_code($)
{
my $s = shift;
# Map strategically incompatible non-ISO characters in the
# range 0x82 -- 0x9F into plausible substitutes where
# possible.
$s =~ s-\x83-f-g;
$s =~ s-\x98-~-g;
$s =~ s-\x99-TM-g;
return $s;
}
# output the tag wanted (add the <> and the / if necessary)
# - output in lower or upper case
# - do tag-related processing
# options:
# tag_type=>'start' | tag_type=>'end' | tag_type=>'empty'
# (default start)
# inside_tag=>string (default empty)
sub get_tag ($$;%)
{
my $self = shift;
my $in_tag = shift;
my %args = (
tag_type => 'start',
inside_tag => '',
@_
);
my $inside_tag = $args{inside_tag};
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
if (!defined $open_tag)
{
$open_tag = '';
}
# close any open tags that need closing
# Note that we only have to check for the structural tags we make,
# not every possible HTML tag
my $tag_prefix = '';
if ($self->{xhtml})
{
if ( $open_tag eq 'P'
and $in_tag eq 'P'
and $args{tag_type} ne 'end')
{
$tag_prefix = $self->close_tag('P');
}
elsif ( $open_tag eq 'P'
and $in_tag =~ /^(HR|UL|OL|DL|PRE|TABLE|H)/)
{
$tag_prefix = $self->close_tag('P');
}
elsif ( $open_tag eq 'LI'
and $in_tag eq 'LI'
and $args{tag_type} ne 'end')
{
# close a LI before the next LI
$tag_prefix = $self->close_tag('LI');
}
elsif ( $open_tag eq 'LI'
and $in_tag =~ /^(UL|OL)$/
and $args{tag_type} eq 'end')
{
# close the LI before the list closes
$tag_prefix = $self->close_tag('LI');
}
elsif ( $open_tag eq 'DT'
and $in_tag eq 'DD'
and $args{tag_type} ne 'end')
{
# close a DT before the next DD
$tag_prefix = $self->close_tag('DT');
}
elsif ( $open_tag eq 'DD'
and $in_tag eq 'DT'
and $args{tag_type} ne 'end')
{
# close a DD before the next DT
$tag_prefix = $self->close_tag('DD');
}
elsif ( $open_tag eq 'DD'
and $in_tag eq 'DL'
and $args{tag_type} eq 'end')
{
# close the DD before the list closes
$tag_prefix = $self->close_tag('DD');
}
}
my $out_tag = $in_tag;
if ($args{tag_type} eq 'end')
{
$out_tag = $self->close_tag($in_tag);
}
else
{
if ($self->{lower_case_tags})
{
$out_tag =~ tr/A-Z/a-z/;
}
else # upper case
{
$out_tag =~ tr/a-z/A-Z/;
}
if ($args{tag_type} eq 'empty')
{
if ($self->{xhtml})
{
$out_tag = "<${out_tag}${inside_tag}/>";
}
else
{
$out_tag = "<${out_tag}${inside_tag}>";
}
}
else
{
push @{$self->{__tags}}, $in_tag;
$out_tag = "<${out_tag}${inside_tag}>";
}
}
$out_tag = $tag_prefix . $out_tag if $tag_prefix;
if ($self->{dict_debug} & 8)
{
print STDERR
"open_tag = '${open_tag}', in_tag = '${in_tag}', tag_type = ",
$args{tag_type},
", inside_tag = '${inside_tag}', out_tag = '$out_tag'\n";
}
return $out_tag;
} # get_tag
# close the open tag
sub close_tag ($$)
{
my $self = shift;
my $in_tag = shift;
my $open_tag = pop @{$self->{__tags}};
$in_tag ||= $open_tag;
# put the open tag back on the stack if the in-tag is not the same
if (defined $open_tag && $open_tag ne $in_tag)
{
push @{$self->{__tags}}, $open_tag;
}
my $out_tag = $in_tag;
if ($self->{lower_case_tags})
{
$out_tag =~ tr/A-Z/a-z/;
}
else # upper case
{
$out_tag =~ tr/a-z/A-Z/;
}
$out_tag = "<\/${out_tag}>";
if ($self->{dict_debug} & 8)
{
print STDERR
"close_tag: open_tag = '${open_tag}', in_tag = '${in_tag}', out_tag = '$out_tag'\n";
}
return $out_tag;
}
sub hrule ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $hrmin = $self->{hrule_min};
if ($para_lines_ref->[$ind] =~ /^\s*([-_~=\*]\s*){$hrmin,}$/)
{
my $tag = $self->get_tag("HR", tag_type => 'empty');
$para_lines_ref->[$ind] = "$tag\n";
$para_action_ref->[$ind] |= $HRULE;
}
elsif ($para_lines_ref->[$ind] =~ /\014/)
{
# Linefeeds become horizontal rules
$para_action_ref->[$ind] |= $HRULE;
my $tag = $self->get_tag("HR", tag_type => 'empty');
$para_lines_ref->[$ind] =~ s/\014/\n${tag}\n/g;
}
}
sub shortline ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
prev_line_len => 0,
@_
);
my $mode_ref = $args{mode_ref};
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $prev_line_len = $args{prev_line_len};
# Short lines should be broken even on list item lines iff the
# following line is more text. I haven't figured out how to do
# that yet. For now, I'll just not break on short lines in lists.
# (sorry)
my $tag = $self->get_tag('BR', tag_type => 'empty');
if (
${$line_ref} !~ /^\s*$/
&& ${$prev_ref} !~ /^\s*$/
&& ($prev_line_len < $self->{short_line_length})
&& !(
${$line_action_ref} &
($END | $HEADER | $HRULE | $LIST | $IND_BREAK | $PAR)
)
&& !(${$prev_action_ref} & ($HEADER | $HRULE | $BREAK | $IND_BREAK))
)
{
${$prev_ref} .= $tag . chop(${$prev_ref});
${$prev_action_ref} |= $BREAK;
}
}
sub is_mailheader ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
@_
);
my $rows_ref = $args{rows_ref};
# a mail header is assumed to be the whole
# paragraph which starts with a From , From: or Newsgroups: line
if ($rows_ref->[0] =~ /^(From:?)|(Newsgroups:) /)
{
return 1;
}
return 0;
} # is_mailheader
sub mailheader ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
@_
);
my $rows_ref = $args{rows_ref};
# a mail header is assumed to be the whole
# paragraph which starts with a From: or Newsgroups: line
my $tag = '';
my @rows = @{$rows_ref};
if ($self->is_mailheader(%args))
{
$self->{__mode} |= $MAILHEADER;
if ($self->{escape_HTML_chars})
{
$rows[0] = escape($rows[0]);
}
$self->anchor_mail(\$rows[0]);
chomp ${rows}[0];
$tag = $self->get_tag('P', inside_tag => " class='mail_header'");
my $tag2 = $self->get_tag('BR', tag_type => 'empty');
$rows[0] =
join('', "\n", $tag, $rows[0], $tag2, "\n");
# now put breaks on the rest of the paragraph
# apart from the last line
for (my $rn = 1; $rn < @rows; $rn++)
{
if ($self->{escape_HTML_chars})
{
$rows[$rn] = escape($rows[$rn]);
}
if ($rn != (@rows - 1))
{
$tag = $self->get_tag('BR', tag_type => 'empty');
chomp $rows[$rn];
$rows[$rn] =~ s/$/${tag}\n/;
}
}
}
@{$rows_ref} = @rows;
} # mailheader
sub mailquote ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
next_ref => undef,
@_
);
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $next_ref = $args{next_ref};
my $tag = '';
if (
(
(${$line_ref} =~ /^\w*>/) # Handle "FF> Werewolves."
|| (${$line_ref} =~ /^[\|:]/)
) # Handle "[|:] There wolves."
&& defined($next_ref) && (${$next_ref} !~ /^\s*$/)
)
{
$tag = $self->get_tag('BR', tag_type => 'empty');
${$line_ref} =~ s/$/${tag}/;
${$line_action_ref} |= ($BREAK | $MAILQUOTE);
if (!(${$prev_action_ref} & ($BREAK | $MAILQUOTE)))
{
$tag = $self->get_tag('P', inside_tag => " class='quote_mail'");
${$prev_ref} .= $tag;
${$line_action_ref} |= $PAR;
}
}
}
# Subtracts modes listed in $mask from $vector.
sub subtract_modes ($$)
{
my ($vector, $mask) = @_;
return ($vector | $mask) - $mask;
}
sub paragraph ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
line_indent => 0,
prev_indent => 0,
is_fragment => 0,
ind => 0,
@_
);
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $line_indent = $args{line_indent};
my $prev_indent = $args{prev_indent};
my $is_fragment = $args{is_fragment};
my $line_no = $args{ind};
my $tag = '';
if (
${$line_ref} !~ /^\s*$/
&& !subtract_modes(
${$line_action_ref}, $END | $MAILQUOTE | $CAPS | $BREAK
)
&& ( ${$prev_ref} =~ /^\s*$/
|| (${$line_action_ref} & $END)
|| ($line_indent > $prev_indent + $self->{par_indent}))
&& !($is_fragment && $line_no == 0)
)
{
if ( $self->{indent_par_break}
&& ${$prev_ref} !~ /^\s*$/
&& !(${$line_action_ref} & $END)
&& ($line_indent > $prev_indent + $self->{par_indent}))
{
$tag = $self->get_tag('BR', tag_type => 'empty');
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$prev_action_ref} |= $BREAK;
${$line_action_ref} |= $IND_BREAK;
}
elsif ($self->{preserve_indent})
{
$tag = $self->get_tag('P');
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$line_action_ref} |= $PAR;
}
else
{
$tag = $self->get_tag('P');
${$prev_ref} .= $tag;
${$line_action_ref} |= $PAR;
}
}
# detect also a continuing indentation at the same level
elsif ($self->{indent_par_break}
&& !($self->{__mode} & ($PRE | $TABLE | $LIST))
&& ${$prev_ref} !~ /^\s*$/
&& !(${$line_action_ref} & $END)
&& (${$prev_action_ref} & ($IND_BREAK | $PAR))
&& !subtract_modes(${$line_action_ref}, $END | $MAILQUOTE | $CAPS)
&& ($line_indent > $self->{par_indent})
&& ($line_indent == $prev_indent))
{
$tag = $self->get_tag('BR', tag_type => 'empty');
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$prev_action_ref} |= $BREAK;
${$line_action_ref} |= $IND_BREAK;
}
}
sub listprefix ($$)
{
my $self = shift;
my $line = shift;
my ($prefix, $number, $rawprefix, $term);
my $bullets = $self->{bullets};
my $bullets_ordered = $self->{bullets_ordered};
my $number_match = '(\d+|[^\W\d])';
if ($bullets_ordered)
{
$number_match = '(\d+|[a-zA-Z]|[' . "${bullets_ordered}])";
}
$self->{__number_match} = $number_match;
my $term_match = '(\w\w+)';
$self->{__term_match} = $term_match;
return (0, 0, 0, 0)
if ( !($line =~ /^\s*[${bullets}]\s+\S/)
&& !($line =~ /^\s*${number_match}[\.\)\]:]\s+\S/)
&& !($line =~ /^\s*${term_match}:$/));
($term) = $line =~ /^\s*${term_match}:$/;
($number) = $line =~ /^\s*${number_match}\S\s+\S/;
$number = 0 unless defined($number);
if ( $bullets_ordered
&& $number =~ /[${bullets_ordered}]/)
{
$number = 1;
}
# That slippery exception of "o" as a bullet
# (This ought to be determined using the context of what lists
# we have in progress, but this will probably work well enough.)
if ($bullets =~ /o/ && $line =~ /^\s*o\s/)
{
$number = 0;
}
if ($term)
{
($rawprefix) = $line =~ /^(\s*${term_match}.)$/;
$prefix = $rawprefix;
$prefix =~ s/${term_match}//; # Take the term out
}
elsif ($number)
{
($rawprefix) = $line =~ /^(\s*${number_match}.)/;
$prefix = $rawprefix;
$prefix =~ s/${number_match}//; # Take the number out
}
else
{
($rawprefix) = $line =~ /^(\s*[${bullets}].)/;
$prefix = $rawprefix;
}
($prefix, $number, $rawprefix, $term);
} # listprefix
sub startlist ($%)
{
my $self = shift;
my %args = (
prefix => '',
number => 0,
rawprefix => '',
term => '',
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
prev_ref => undef,
total_prefix => '',
@_
);
my $prefix = $args{prefix};
my $number = $args{number};
my $rawprefix = $args{rawprefix};
my $term = $args{term};
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $prev_ref = $args{prev_ref};
my $tag = '';
$self->{__listprefix}->[$self->{__listnum}] = $prefix;
if ($number)
{
# It doesn't start with 1,a,A. Let's not screw with it.
if (($number ne "1") && ($number ne "a") && ($number ne "A"))
{
return 0;
}
$tag = $self->get_tag('OL');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $OL;
}
elsif ($term)
{
$tag = $self->get_tag('DL');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $DL;
}
else
{
$tag = $self->get_tag('UL');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $UL;
}
$self->{__list_indent}->[$self->{__listnum}] = length($args{total_prefix});
$self->{__listnum}++;
$self->{__list_nice_indent} =
" " x $self->{__listnum} x $self->{indent_width};
$para_action_ref->[$ind] |= $LIST;
$para_action_ref->[$ind] |= $LIST_START;
$self->{__mode} |= $LIST;
1;
} # startlist
# End N lists
sub endlist ($%)
{
my $self = shift;
my %args = (
num_lists => 0,
prev_ref => undef,
line_action_ref => undef,
@_
);
my $n = $args{num_lists};
my $prev_ref = $args{prev_ref};
my $line_action_ref = $args{line_action_ref};
my $tag = '';
for (; $n > 0; $n--, $self->{__listnum}--)
{
$self->{__list_nice_indent} =
" " x ($self->{__listnum} - 1) x $self->{indent_width};
if ($self->{__list}->[$self->{__listnum} - 1] == $UL)
{
$tag = $self->get_tag('UL', tag_type => 'end');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
elsif ($self->{__list}->[$self->{__listnum} - 1] == $OL)
{
$tag = $self->get_tag('OL', tag_type => 'end');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
elsif ($self->{__list}->[$self->{__listnum} - 1] == $DL)
{
$tag = $self->get_tag('DL', tag_type => 'end');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
else
{
print STDERR "Encountered list of unknown type\n";
}
}
${$line_action_ref} |= $END;
$self->{__mode} ^= $LIST if (!$self->{__listnum});
} # endlist
sub continuelist ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
term => '',
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $term = $args{term};
my $list_indent = $self->{__list_nice_indent};
my $bullets = $self->{bullets};
my $num_match = $self->{__number_match};
my $term_match = $self->{__term_match};
my $tag = '';
if ( $self->{__list}->[$self->{__listnum} - 1] == $UL
&& $para_lines_ref->[$ind] =~ /^\s*[${bullets}]\s*/)
{
$tag = $self->get_tag('LI');
$para_lines_ref->[$ind] =~ s/^\s*[${bullets}]\s*/${list_indent}${tag}/;
$para_action_ref->[$ind] |= $LIST_ITEM;
}
if ($self->{__list}->[$self->{__listnum} - 1] == $OL)
{
$tag = $self->get_tag('LI');
$para_lines_ref->[$ind] =~ s/^\s*${num_match}.\s*/${list_indent}${tag}/;
$para_action_ref->[$ind] |= $LIST_ITEM;
}
if ( $self->{__list}->[$self->{__listnum} - 1] == $DL
&& $term)
{
$tag = $self->get_tag('DT');
my $tag2 = $self->get_tag('DT', tag_type => 'end');
$term =~ s/_/ /g; # underscores are now spaces in the term
$para_lines_ref->[$ind] =~
s/^\s*${term_match}.$/${list_indent}${tag}${term}${tag2}/;
$tag = $self->get_tag('DD');
$para_lines_ref->[$ind] .= ${tag};
$para_action_ref->[$ind] |= $LIST_ITEM;
}
$para_action_ref->[$ind] |= $LIST;
} # continuelist
sub liststuff ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
para_line_indent_ref => undef,
ind => 0,
prev_ref => undef,
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $para_line_indent_ref = $args{para_line_indent_ref};
my $ind = $args{ind};
my $prev_ref = $args{prev_ref};
my $i;
my ($prefix, $number, $rawprefix, $term) =
$self->listprefix($para_lines_ref->[$ind]);
if (!$prefix)
{
# if the previous line is not blank
if ($ind > 0 && $para_lines_ref->[$ind - 1] !~ /^\s*$/)
{
# inside a list item
return;
}
# This might be a new paragraph within an existing list item;
# It will be the first line, and have the same indentation
# as the list's indentation.
if ( $ind == 0
&& $self->{__listnum}
&& $para_line_indent_ref->[$ind] ==
$self->{__list_indent}->[$self->{__listnum} - 1])
{
# start a paragraph
my $tag = $self->get_tag('P');
${$prev_ref} .= $tag;
$para_action_ref->[$ind] |= $PAR;
return;
}
# This ain't no list. We'll want to end all of them.
if ($self->{__listnum})
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => $prev_ref,
line_action_ref => \$para_action_ref->[$ind]
);
}
return;
}
# If numbers with more than one digit grow to the left instead of
# to the right, the prefix will shrink and we'll fail to match the
# right list. We need to account for this.
my $prefix_alternate;
if (length("" . $number) > 1)
{
$prefix_alternate = (" " x (length("" . $number) - 1)) . $prefix;
}
# Maybe we're going back up to a previous list
for (
$i = $self->{__listnum} - 1;
($i >= 0) && ($prefix ne $self->{__listprefix}->[$i]);
$i--
)
{
if (length("" . $number) > 1)
{
last if $prefix_alternate eq $self->{__listprefix}->[$i];
}
}
my $islist;
# Measure the indent from where the text starts, not where the
# prefix starts. This won't screw anything up, and if we don't do
# it, the next line might appear to be indented relative to this
# line, and get tagged as a new paragraph.
my $bullets = $self->{bullets};
my $bullets_ordered = $self->{bullets_ordered};
my $term_match = $self->{__term_match};
my ($total_prefix) =
$para_lines_ref->[$ind] =~ /^(\s*[${bullets}${bullets_ordered}\w]+.\s*)/;
# a DL indent starts from the edge of the term, plus indent_width
if ($term)
{
($total_prefix) = $para_lines_ref->[$ind] =~ /^(\s*)${term_match}.$/;
$total_prefix .= " " x $self->{indent_width};
}
# Of course, we only use it if it really turns out to be a list.
$islist = 1;
$i++;
if (($i > 0) && ($i != $self->{__listnum}))
{
$self->endlist(
num_lists => $self->{__listnum} - $i,
prev_ref => $prev_ref,
line_action_ref => \$para_action_ref->[$ind]
);
$islist = 0;
}
elsif (!$self->{__listnum} || ($i != $self->{__listnum}))
{
if (
($para_line_indent_ref->[$ind] > 0)
|| $ind == 0
|| ($ind > 0 && ($para_lines_ref->[$ind - 1] =~ /^\s*$/))
|| ( $ind > 0
&& $para_action_ref->[$ind - 1] & ($BREAK | $HEADER | $CAPS))
)
{
$islist = $self->startlist(
prefix => $prefix,
number => $number,
rawprefix => $rawprefix,
term => $term,
para_lines_ref => $para_lines_ref,
para_action_ref => $para_action_ref,
ind => $ind,
prev_ref => $prev_ref,
total_prefix => $total_prefix
);
}
else
{
# We have something like this: "- foo" which usually
# turns out not to be a list.
return;
}
}
$self->continuelist(
para_lines_ref => $para_lines_ref,
para_action_ref => $para_action_ref,
ind => $ind,
term => $term
)
if ($self->{__mode} & $LIST);
$para_line_indent_ref->[$ind] = length($total_prefix) if $islist;
} # liststuff
# figure out the table type of this table, if any
sub get_table_type ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $table_type = 0;
if ( $self->{table_type}->{DELIM}
&& $self->is_delim_table(%args))
{
$table_type = $TAB_DELIM;
}
elsif ($self->{table_type}->{ALIGN}
&& $self->is_aligned_table(%args))
{
$table_type = $TAB_ALIGN;
}
elsif ($self->{table_type}->{PGSQL}
&& $self->is_pgsql_table(%args))
{
$table_type = $TAB_PGSQL;
}
elsif ($self->{table_type}->{BORDER}
&& $self->is_border_table(%args))
{
$table_type = $TAB_BORDER;
}
return $table_type;
}
# check if the given paragraph-array is an aligned table
sub is_aligned_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# TABLES: spot and mark up tables. We combine the lines of the
# paragraph using the string bitwise or (|) operator, the result
# being in $spaces. A character in $spaces is a space only if
# there was a space at that position in every line of the
# paragraph. $space can be used to search for contiguous spaces
# that occur on all lines of the paragraph. If this results in at
# least two columns, the paragraph is identified as a table.
# Note that this sub must be called before checking for preformatted
# lines because a table may well have whitespace to the left, in
# which case it must not be incorrectly recognised as a preformat.
my @rows = @{$rows_ref};
my @starts;
my $spaces = '';
my $max = 0;
my $min = $para_len;
foreach my $row (@rows)
{
($spaces |= $row) =~ tr/ /\xff/c;
$min = length $row if length $row < $min;
$max = length $row if $max < length $row;
}
$spaces = substr $spaces, 0, $min;
push(@starts, 0) unless $spaces =~ /^ /;
while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
{
push @starts, pos($spaces);
}
if (2 <= @rows and 2 <= @starts)
{
return 1;
}
else
{
return 0;
}
}
sub is_pgsql_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# a PGSQL table can start with an optional table-caption,
# then it has a row of column headings separated by |
# then it has a row of ------+-----
# then it has one or more rows of column values separated by |
# then it has a row-count (N rows)
# Thus it must have at least 4 rows.
if (@{$rows_ref} < 4)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 4)
{
return 0;
}
if ($rows[0] !~ /^\s*\w+\s+\|\s+/) # Colname |
{
return 0;
}
if ($rows[1] !~ /^\s*[-]+[+][-]+/) # ----+----
{
return 0;
}
if ($rows[2] !~ /^\s*[^|]*\s+\|\s+/) # value |
{
return 0;
}
# check the last row for rowcount
if ($rows[$#rows] !~ /\(\d+\s+rows\)/)
{
return 0;
}
return 1;
}
sub is_border_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# a BORDER table can start with an optional table-caption,
# then it has a row of +------+-----+
# then it has a row of column headings separated by |
# then it has a row of +------+-----+
# then it has one or more rows of column values separated by |
# then it has a row of +------+-----+
# Thus it must have at least 5 rows.
# And note that it could be indented with spaces
if (@{$rows_ref} < 5)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 5)
{
return 0;
}
if ($rows[0] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
if ($rows[1] !~ /^\s*\|\s*\w+\s+\|\s+.*\|$/) # | Colname |
{
return 0;
}
if ($rows[2] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
if ($rows[3] !~ /^\s*\|\s*[^|]*\s+\|\s+.*\|$/) # | value |
{
return 0;
}
# check the last row for +------+------+
if ($rows[$#rows] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
return 1;
} # is_border_table
sub is_delim_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# a DELIM table can start with an optional table-caption,
# then it has at least two rows which start and end and are
# punctuated by a non-alphanumeric delimiter.
#
# | val1 | val2 |
# | val3 | val4 |
#
# And note that it could be indented with spaces
if (@{$rows_ref} < 2)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /[^\w\s]/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 2)
{
return 0;
}
# figure out if the row starts with a possible delimiter
my $delim = '';
if ($rows[0] =~ /^\s*([^a-zA-Z0-9])/)
{
$delim = $1;
# have to get rid of ^ and []
$delim =~ s/\^//g;
$delim =~ s/\[//g;
$delim =~ s/\]//g;
if (!$delim) # no delimiter after all
{
return 0;
}
}
else
{
return 0;
}
# There needs to be at least three delimiters in the row
my @all_delims = ($rows[0] =~ /[${delim}]/g);
my $total_num_delims = @all_delims;
if ($total_num_delims < 3)
{
return 0;
}
# All rows must start and end with the delimiter
# and have $total_num_delims number of them
foreach my $row (@rows)
{
if ($row !~ /^\s*[${delim}]/)
{
return 0;
}
if ($row !~ /[${delim}]\s*$/)
{
return 0;
}
@all_delims = ($row =~ /[${delim}]/g);
if (@all_delims != $total_num_delims)
{
return 0;
}
}
return 1;
} # is_delim_table
sub tablestuff ($%)
{
my $self = shift;
my %args = (
table_type => 0,
rows_ref => undef,
para_len => 0,
@_
);
my $table_type = $args{table_type};
if ($table_type eq $TAB_ALIGN)
{
return $self->make_aligned_table(%args);
}
if ($table_type eq $TAB_PGSQL)
{
return $self->make_pgsql_table(%args);
}
if ($table_type eq $TAB_BORDER)
{
return $self->make_border_table(%args);
}
if ($table_type eq $TAB_DELIM)
{
return $self->make_delim_table(%args);
}
} # tablestuff
sub make_aligned_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# TABLES: spot and mark up tables. We combine the lines of the
# paragraph using the string bitwise or (|) operator, the result
# being in $spaces. A character in $spaces is a space only if
# there was a space at that position in every line of the
# paragraph. $space can be used to search for contiguous spaces
# that occur on all lines of the paragraph. If this results in at
# least two columns, the paragraph is identified as a table.
# Note that this sub must be called before checking for preformatted
# lines because a table may well have whitespace to the left, in
# which case it must not be incorrectly recognised as a preformat.
my @rows = @{$rows_ref};
my @starts;
my @ends;
my $spaces;
my $max = 0;
my $min = $para_len;
foreach my $row (@rows)
{
($spaces |= $row) =~ tr/ /\xff/c;
$min = length $row if length $row < $min;
$max = length $row if $max < length $row;
}
$spaces = substr $spaces, 0, $min;
push(@starts, 0) unless $spaces =~ /^ /;
while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
{
push @ends, pos($spaces) - length $1;
push @starts, pos($spaces);
}
shift(@ends) if $spaces =~ /^ /;
push(@ends, $max);
# Two or more rows and two or more columns indicate a table.
if (2 <= @rows and 2 <= @starts)
{
$self->{__mode} |= $TABLE;
# For each column, guess whether it should be left, centre or
# right aligned by examining all cells in that column for space
# to the left or the right. A simple majority among those cells
# that actually have space to one side or another decides (if no
# alignment gets a majority, left alignment wins by default).
my @align;
my $cell = '';
foreach my $col (0 .. $#starts)
{
my @count = (0, 0, 0, 0);
foreach my $row (@rows)
{
my $width = $ends[$col] - $starts[$col];
$cell = substr $row, $starts[$col], $width;
++$count[($cell =~ /^ / ? 2 : 0) +
($cell =~ / $/ || length($cell) < $width ? 1 : 0)];
}
$align[$col] = 0;
my $population = $count[1] + $count[2] + $count[3];
foreach (1 .. 3)
{
if ($count[$_] * 2 > $population)
{
$align[$col] = $_;
last;
}
}
}
foreach my $row (@rows)
{
$row = join '', $self->get_tag('TR'), (
map {
$cell = substr $row, $starts[$_], $ends[$_] - $starts[$_];
$cell =~ s/^ +//;
$cell =~ s/ +$//;
if ($self->{escape_HTML_chars})
{
$cell = escape($cell);
}
(
$self->get_tag(
'TD',
inside_tag => (
$self->{xhtml} ? $xhtml_alignments[$align[$_]]
: (
$self->{lower_case_tags}
? $lc_alignments[$align[$_]]
: $alignments[$align[$_]]
)
)
),
$cell,
$self->close_tag('TD')
);
} 0 .. $#starts
),
$self->close_tag('TR');
}
# put the