summaryrefslogtreecommitdiff
path: root/lib/HTML
diff options
context:
space:
mode:
Diffstat (limited to 'lib/HTML')
-rw-r--r--lib/HTML/TextToHTML.pm5266
1 files changed, 5266 insertions, 0 deletions
diff --git a/lib/HTML/TextToHTML.pm b/lib/HTML/TextToHTML.pm
new file mode 100644
index 0000000..82f3dc2
--- /dev/null
+++ b/lib/HTML/TextToHTML.pm
@@ -0,0 +1,5266 @@
+package HTML::TextToHTML;
+use 5.006_001;
+use strict;
+#------------------------------------------------------------------------
+
+=head1 NAME
+
+HTML::TextToHTML - convert plain text file to HTML.
+
+=head1 VERSION
+
+This describes version B<2.42> of HTML::TextToHTML.
+
+=cut
+
+our $VERSION = '2.42';
+
+=head1 SYNOPSIS
+
+ From the command line:
+
+ txt2html I<arguments>
+
+ From Scripts:
+
+ use HTML::TextToHTML;
+
+ # create a new object
+ my $conv = new HTML::TextToHTML();
+
+ # convert a file
+ $conv->txt2html(infile=>[$text_file],
+ outfile=>$html_file,
+ title=>"Wonderful Things",
+ mail=>1,
+ ]);
+
+ # reset arguments
+ $conv->args(infile=>[], mail=>0);
+
+ # convert a string
+ $newstring = $conv->process_chunk($mystring)
+
+=head1 DESCRIPTION
+
+HTML::TextToHTML converts plain text files to HTML. The txt2html script
+uses this module to do the same from the command-line.
+
+It supports headings, tables, lists, simple character markup, and
+hyperlinking, and is highly customizable. It recognizes some of the
+apparent structure of the source document (mostly whitespace and
+typographic layout), and attempts to mark that structure explicitly
+using HTML. The purpose for this tool is to provide an easier way of
+converting existing text documents to HTML format, giving something nicer
+than just whapping the text into a big PRE block.
+
+=head2 History
+
+The original txt2html script was written by Seth Golub (see
+http://www.aigeek.com/txt2html/), and converted to a perl module by
+Kathryn Andersen (see http://www.katspace.com/tools/text_to_html/) and
+made into a sourceforge project by Sun Tong (see
+http://sourceforge.net/projects/txt2html/). Earlier versions of the
+HTML::TextToHTML module called the included script texthyper so as not
+to clash with the original txt2html script, but now the projects have
+all been merged.
+
+=head1 REQUIRES
+
+HTML::TextToHTML requires Perl 5.6.1 or later.
+
+For installation, it needs:
+
+ Module::Build
+
+The txt2html script needs:
+
+ Getopt::Long
+ Getopt::ArgvFile
+ Pod::Usage
+ File::Basename
+
+For testing, it also needs:
+
+ Test::More
+
+For debugging, it also needs:
+
+ Data::Dumper
+
+=head1 INSTALLATION
+
+Make sure you have the dependencies installed first!
+(see REQUIRES above)
+
+Some of those modules come standard with more recent versions of perl,
+but I thought I'd mention them anyway, just in case you may not have
+them.
+
+If you don't know how to install these, try using the CPAN module, an
+easy way of auto-installing modules from the Comprehensive Perl Archive
+Network, where the above modules reside.
+Do "perldoc perlmodinstall" or "perldoc CPAN" for more information.
+
+To install this module type the following:
+
+ perl Build.PL
+ ./Build
+ ./Build test
+ ./Build install
+
+Or, if you're on a platform (like DOS or Windows) that doesn't like the
+"./" notation, you can do this:
+
+ perl Build.PL
+ perl Build
+ perl Build test
+ perl Build install
+
+In order to install somewhere other than the default, such as
+in a directory under your home directory, like "/home/fred/perl"
+go
+
+ perl Build.PL --install_base /home/fred/perl
+
+as the first step instead.
+
+This will install the files underneath /home/fred/perl.
+
+You will then need to make sure that you alter the PERL5LIB variable to
+find the modules, and the PATH variable to find the script.
+
+Therefore you will need to change:
+your path, to include /home/fred/perl/script (where the script will be)
+
+ PATH=/home/fred/perl/script:${PATH}
+
+the PERL5LIB variable to add /home/fred/perl/lib
+
+ PERL5LIB=/home/fred/perl/lib:${PERL5LIB}
+
+Note that the system links dictionary will be installed as
+"/home/fred/perl/share/txt2html/txt2html.dict"
+
+If you want to install in a temporary install directory (such as
+if you are building a package) then instead of going
+
+ perl Build install
+
+go
+
+ perl Build install destdir=/my/temp/dir
+
+and it will be installed there, with a directory structure under
+/my/temp/dir the same as it would be if it were installed plain.
+Note that this is NOT the same as setting --install_base, because
+certain things are done at build-time which use the install_base info.
+
+See "perldoc perlrun" for more information on PERL5LIB, and
+see "perldoc Module::Build" for more information on
+installation options.
+
+=head1 OPTIONS
+
+All arguments can be set when the object is created, and further options
+can be set when calling the actual txt2html method. Arguments
+to methods can take either a hash of arguments, or a reference to an
+array. Note that the reference-to-array method is depricated and is only
+retained for backwards compatibility.
+
+Note that all option-names must match exactly -- no abbreviations are
+allowed.
+
+The arguments get treated differently depending on whether they are
+given in a hash or a reference to an array. When the arguments are
+in a hash, the argument-keys are expected to have values matching
+those required for that argument -- whether that be a boolean, a string,
+a reference to an array or a reference to a hash. These will replace
+any value for that argument that might have been there before.
+
+When the arguments are in a reference to an array, it is treated
+somewhat as if it were a command-line: option names are expected to
+start with '--' or '-', boolean options are set to true as soon as the
+option is given (no value is expected to follow), boolean options with
+the word "no" prepended set the option to false, string options are
+expected to have a string value following, and those options which are
+internally arrays or hashes are treated as cumulative; that is, the
+value following the --option is added to the current set for that
+option, to add more, one just repeats the --option with the next value,
+and in order to reset that option to empty, the special value of "CLEAR"
+must be added to the list.
+
+NOTE: the reference-to-an-array usage is DEPRECATED and will be removed
+in the future.
+
+=over
+
+=item append_file
+
+ append_file=>I<filename>
+
+If you want something appended by default, put the filename here.
+The appended text will not be processed at all, so make sure it's
+plain text or decent HTML. i.e. do not have things like:
+ Mary Andersen E<lt>kitty@example.comE<gt>
+but instead, have:
+ Mary Andersen &lt;kitty@example.com&gt;
+
+(default: nothing)
+
+=item append_head
+
+ append_head=>I<filename>
+
+If you want something appended to the head by default, put the filename here.
+The appended text will not be processed at all, so make sure it's
+plain text or decent HTML. i.e. do not have things like:
+ Mary Andersen E<lt>kitty@example.comE<gt>
+but instead, have:
+ Mary Andersen &lt;kitty@example.com&gt;
+
+(default: nothing)
+
+=item body_deco
+
+ body_deco=>I<string>
+
+Body decoration string: a string to be added to the BODY tag so that
+one can set attributes to the BODY (such as class, style, bgcolor etc)
+For example, "class='withimage'".
+
+=item bold_delimiter
+
+ bold_delimiter=>I<string>
+
+This defines what character (or string) is taken to be the delimiter of
+text which is to be interpreted as bold (that is, to be given a STRONG
+tag). If this is empty, then no bolding of text will be done.
+(default: #)
+
+=item bullets
+
+ bullets=>I<string>
+
+This defines what single characters are taken to be "bullet" characters
+for unordered lists. Note that because this is used as a character
+class, if you use '-' it must come first.
+(default:-=o*\267)
+
+=item bullets_ordered
+
+ bullets_ordered=>I<string>
+
+This defines what single characters are taken to be "bullet" placeholder
+characters for ordered lists. Ordered lists are normally marked by
+a number or letter followed by '.' or ')' or ']' or ':'. If an ordered
+bullet is used, then it simply indicates that this is an ordered list,
+without giving explicit numbers.
+
+Note that because this is used as a character class, if you use '-' it
+must come first.
+(default:nothing)
+
+=item caps_tag
+
+ caps_tag=>I<tag>
+
+Tag to put around all-caps lines
+(default: STRONG)
+If an empty tag is given, then no tag will be put around all-caps lines.
+
+=item custom_heading_regexp
+
+ custom_heading_regexp=>I<regexp>
+
+Add a regexp for headings. Header levels are assigned by regexp
+in order seen When a line matches a custom header regexp, it is tagged as
+a header. If it's the first time that particular regexp has matched,
+the next available header level is associated with it and applied to
+the line. Any later matches of that regexp will use the same header level.
+Therefore, if you want to match numbered header lines, you could use
+something like this:
+
+ -H '^ *\d+\. \w+' -H '^ *\d+\.\d+\. \w+' -H '^ *\d+\.\d+\.\d+\. \w+'
+
+Then lines like
+
+ " 1. Examples "
+ " 1.1. Things"
+ and " 4.2.5. Cold Fusion"
+
+Would be marked as H1, H2, and H3 (assuming they were found in that
+order, and that no other header styles were encountered).
+If you prefer that the first one specified always be H1, the second
+always be H2, the third H3, etc, then use the -EH/--explicit-headings
+option.
+
+This is a multi-valued option.
+
+(default: none)
+
+=item debug
+
+ debug=>1
+
+Enable copious script debugging output (don't bother, this is for the
+developer) (default: false)
+
+=item default_link_dict
+
+ default_link_dict=>I<filename>
+
+The name of the default "user" link dictionary.
+(default: "$ENV{'HOME'}/.txt2html.dict" -- this is the same as for
+the txt2html script. If there is no $ENV{HOME} then it is just '.txt2html.dict')
+
+=item demoronize
+
+ demoronize=>1
+
+Convert Microsoft-generated character codes that are non-ISO codes into
+something more reasonable.
+(default:true)
+
+=item dict_debug
+
+ dict_debug=>I<n>
+
+Debug mode for link dictionaries Bitwise-Or what you want to see:
+ 1: The parsing of the dictionary
+ 2: The code that will make the links
+ 4: When each rule matches something
+ 8: When each tag is created
+
+(default: 0)
+
+=item doctype
+
+ doctype=>I<doctype>
+
+This gets put in the DOCTYPE field at the top of the document, unless it's
+empty. (default : "-//W3C//DTD HTML 3.2 Final//EN")
+If --xhtml is true, the contents of this is ignored, unless it's
+empty, in which case no DOCTYPE declaration is output.
+
+=item eight_bit_clean
+
+ eight_bit_clean=>1
+
+disable Latin-1 character entity naming
+(default: false)
+
+=item escape_HTML_chars
+
+ escape_HTML_chars=>1
+
+turn & E<lt> E<gt> into &amp; &gt; &lt;
+(default: true)
+
+=item explicit_headings
+
+ explicit_headings=>1
+
+Don't try to find any headings except the ones specified in the
+--custom_heading_regexp option.
+Also, the custom headings will not be assigned levels in the order they
+are encountered in the document, but in the order they are specified on
+the command line.
+(default: false)
+
+=item extract
+
+ extract=>1
+
+Extract Mode; don't put HTML headers or footers on the result, just
+the plain HTML (thus making the result suitable for inserting into
+another document (or as part of the output of a CGI script).
+(default: false)
+
+=item hrule_min
+
+ hrule_min=>I<n>
+
+Min number of ---s for an HRule.
+(default: 4)
+
+=item indent_width
+
+ indent_width=>I<n>
+
+Indents this many spaces for each level of a list.
+(default: 2)
+
+=item indent_par_break
+
+ indent_par_break=>1
+
+Treat paragraphs marked solely by indents as breaks with indents.
+That is, instead of taking a three-space indent as a new paragraph,
+put in a <BR> and three non-breaking spaces instead.
+(see also --preserve_indent)
+(default: false)
+
+=item infile
+
+ infile=>\@my_files
+ infile=>['chapter1.txt', 'chapter2.txt']
+ "--infile", "chapter1.txt", "--infile", "chapter2.txt"
+
+The name of the input file(s). When the arguments are given as a hash,
+this expects a reference to an array of filenames. When the arguments
+are given as a reference to an array, then the "--infile" option must
+be repeated for each new file added to the list. If you want to reset
+the list to be empty, give the special value of "CLEAR".
+
+The special filename '-' designates STDIN.
+
+See also L</inhandle> and L</instring>.
+
+(default:-)
+
+=item inhandle
+
+ inhandle=>\@my_handles
+ inhandle=>[\*MYINHANDLE, \*STDIN]
+
+An array of input filehandles; use this instead of
+L</infile> or L</instring> to use a filehandle or filehandles
+as input.
+
+=item instring
+
+ instring=>\@my_strings
+ instring=>[$string1, $string2]
+
+An array of input strings; use this instead of
+L</infile> or L</inhandle> to use a string or strings
+as input.
+
+=item italic_delimiter
+
+ italic_delimiter=>I<string>
+
+This defines what character (or string) is taken to be the delimiter of
+text which is to be interpreted as italic (that is, to be given a EM
+tag). If this is empty, no italicising of text will be done.
+(default: *)
+
+=item links_dictionaries
+
+ links_dictionaries=>\@my_link_dicts
+ links_dictionaries=>['url_links.dict', 'format_links.dict']
+ "--links_dictionaries", "url_links.dict", "--links_dictionaries", "format_links.dict"
+
+File(s) to use as a link-dictionary. There can be more than one of
+these. These are in addition to the Global Link Dictionary and the User
+Link Dictionary. When the arguments are given as a hash, this expects a
+reference to an array of filenames. When the arguments are given as a
+reference to an array, then the "--links_dictionaries" option must be
+repeated for each new file added to the list. If you want to reset the
+list to be empty, give the special value of "CLEAR".
+
+=item link_only
+
+ link_only=>1
+
+Do no escaping or marking up at all, except for processing the links
+dictionary file and applying it. This is useful if you want to use
+the linking feature on an HTML document. If the HTML is a
+complete document (includes HTML,HEAD,BODY tags, etc) then you'll
+probably want to use the --extract option also.
+(default: false)
+
+=item lower_case_tags
+
+ lower_case_tags=>1
+
+Force all tags to be in lower-case.
+
+=item mailmode
+
+ mailmode=>1
+
+Deal with mail headers & quoted text. The mail header paragraph is
+given the class 'mail_header', and mail-quoted text is given the class
+'quote_mail'.
+(default: false)
+
+=item make_anchors
+
+ make_anchors=>0
+
+Should we try to make anchors in headings?
+(default: true)
+
+=item make_links
+
+ make_links=>0
+
+Should we try to build links? If this is false, then the links
+dictionaries are not consulted and only structural text-to-HTML
+conversion is done. (default: true)
+
+=item make_tables
+
+ make_tables=>1
+
+Should we try to build tables? If true, spots tables and marks them up
+appropriately. See L</Input File Format> for information on how tables
+should be formatted.
+
+This overrides the detection of lists; if something looks like a table,
+it is taken as a table, and list-checking is not done for that
+paragraph.
+
+(default: false)
+
+=item min_caps_length
+
+ min_caps_length=>I<n>
+
+min sequential CAPS for an all-caps line
+(default: 3)
+
+=item outfile
+
+ outfile=>I<filename>
+
+The name of the output file. If it is "-" then the output goes
+to Standard Output.
+(default: - )
+
+=item outhandle
+
+The output filehandle; if this is given then the output goes
+to this filehandle instead of to the file given in L</outfile>.
+
+=item par_indent
+
+ par_indent=>I<n>
+
+Minumum number of spaces indented in first lines of paragraphs.
+ Only used when there's no blank line
+preceding the new paragraph.
+(default: 2)
+
+=item preformat_trigger_lines
+
+ preformat_trigger_lines=>I<n>
+
+How many lines of preformatted-looking text are needed to switch to <PRE>
+ <= 0 : Preformat entire document
+ 1 : one line triggers
+ >= 2 : two lines trigger
+
+(default: 2)
+
+=item endpreformat_trigger_lines
+
+ endpreformat_trigger_lines=>I<n>
+
+How many lines of unpreformatted-looking text are needed to switch from <PRE>
+ <= 0 : Never preformat within document
+ 1 : one line triggers
+ >= 2 : two lines trigger
+(default: 2)
+
+NOTE for preformat_trigger_lines and endpreformat_trigger_lines:
+A zero takes precedence. If one is zero, the other is ignored.
+If both are zero, entire document is preformatted.
+
+=item preformat_start_marker
+
+ preformat_start_marker=>I<regexp>
+
+What flags the start of a preformatted section if --use_preformat_marker
+is true.
+
+(default: "^(:?(:?&lt;)|<)PRE(:?(:?&gt;)|>)\$")
+
+=item preformat_end_marker
+
+ preformat_end_marker=>I<regexp>
+
+What flags the end of a preformatted section if --use_preformat_marker
+is true.
+
+(default: "^(:?(:?&lt;)|<)/PRE(:?(:?&gt;)|>)\$")
+
+=item preformat_whitespace_min
+
+ preformat_whitespace_min=>I<n>
+
+Minimum number of consecutive whitespace characters to trigger
+normal preformatting.
+NOTE: Tabs are expanded to spaces before this check is made.
+That means if B<tab_width> is 8 and this is 5, then one tab may be
+expanded to 8 spaces, which is enough to trigger preformatting.
+(default: 5)
+
+=item prepend_file
+
+ prepend_file=>I<filename>
+
+If you want something prepended to the processed body text, put the
+filename here. The prepended text will not be processed at all, so make
+sure it's plain text or decent HTML.
+
+(default: nothing)
+
+=item preserve_indent
+
+ preserve_indent=>1
+
+Preserve the first-line indentation of paragraphs marked with indents
+by replacing the spaces of the first line with non-breaking spaces.
+(default: false)
+
+=item short_line_length
+
+ short_line_length=>I<n>
+
+Lines this short (or shorter) must be intentionally broken and are kept
+that short.
+(default: 40)
+
+=item style_url
+
+ style_url=>I<url>
+
+This gives the URL of a stylesheet; a LINK tag will be added to the
+output.
+
+=item tab_width
+
+ tab_width=>I<n>
+
+How many spaces equal a tab?
+(default: 8)
+
+=item table_type
+
+ table_type=>{ ALIGN=>0, PGSQL=>0, BORDER=>1, DELIM=>0 }
+
+This determines which types of tables will be recognised when "make_tables"
+is true. The possible types are ALIGN, PGSQL, BORDER and DELIM.
+(default: all types are true)
+
+=item title
+
+ title=>I<title>
+
+You can specify a title. Otherwise it will use a blank one.
+(default: nothing)
+
+=item titlefirst
+
+ titlefirst=>1
+
+Use the first non-blank line as the title.
+
+=item underline_length_tolerance
+
+ underline_length_tolerance=>I<n>
+
+How much longer or shorter can underlines be and still be underlines?
+(default: 1)
+
+=item underline_offset_tolerance
+
+ underline_offset_tolerance=>I<n>
+
+How far offset can underlines be and still be underlines?
+(default: 1)
+
+=item unhyphenation
+
+ unhyphenation=>0
+
+Enables unhyphenation of text.
+(default: true)
+
+=item use_mosaic_header
+
+ use_mosaic_header=>1
+
+Use this option if you want to force the heading styles to match what Mosaic
+outputs. (Underlined with "***"s is H1,
+with "==="s is H2, with "+++" is H3, with "---" is H4, with "~~~" is H5
+and with "..." is H6)
+This was the behavior of txt2html up to version 1.10.
+(default: false)
+
+=item use_preformat_marker
+
+ use_preformat_marker=>1
+
+Turn on preformatting when encountering "<PRE>" on a line by itself, and turn
+it off when there's a line containing only "</PRE>".
+When such preformatted text is detected, the PRE tag will be given the
+class 'quote_explicit'.
+(default: off)
+
+=item xhtml
+
+ xhtml=>1
+
+Try to make the output conform to the XHTML standard, including
+closing all open tags and marking empty tags correctly. This
+turns on --lower_case_tags and overrides the --doctype option.
+Note that if you add a header or a footer file, it is up to you
+to make it conform; the header/footer isn't touched by this.
+Likewise, if you make link-dictionary entries that break XHTML,
+then this won't fix them, except to the degree of putting all tags
+into lower-case.
+
+=back
+
+=head1 METHODS
+
+=cut
+
+#------------------------------------------------------------------------
+
+require Exporter;
+use Data::Dumper;
+
+our $PROG = 'HTML::TextToHTML';
+
+#------------------------------------------------------------------------
+use constant TEXT_TO_HTML => "TEXT_TO_HTML";
+
+########################################
+# Definitions (Don't change these)
+#
+
+# These are just constants I use for making bit vectors to keep track
+# of what modes I'm in and what actions I've taken on the current and
+# previous lines.
+
+our $NONE = 0;
+our $LIST = 1;
+our $HRULE = 2;
+our $PAR = 4;
+our $PRE = 8;
+our $END = 16;
+our $BREAK = 32;
+our $HEADER = 64;
+our $MAILHEADER = 128;
+our $MAILQUOTE = 256;
+our $CAPS = 512;
+our $LINK = 1024;
+our $PRE_EXPLICIT = 2048;
+our $TABLE = 4096;
+our $IND_BREAK = 8192;
+our $LIST_START = 16384;
+our $LIST_ITEM = 32768;
+
+# Constants for Link-processing
+# bit-vectors for what to do with a particular link-dictionary entry
+our $LINK_NOCASE = 1;
+our $LINK_EVAL = 2;
+our $LINK_HTML = 4;
+our $LINK_ONCE = 8;
+our $LINK_SECT_ONCE = 16;
+
+# Constants for Ordered Lists and Unordered Lists.
+# And Definition Lists.
+# I use this in the list stack to keep track of what's what.
+
+our $OL = 1;
+our $UL = 2;
+our $DL = 3;
+
+# Constants for table types
+our $TAB_ALIGN = 1;
+our $TAB_PGSQL = 2;
+our $TAB_BORDER = 3;
+our $TAB_DELIM = 4;
+
+# Character entity names
+# characters to replace with entities
+our %char_entities = (
+ "\241", "&iexcl;", "\242", "&cent;", "\243", "&pound;",
+ "\244", "&curren;", "\245", "&yen;", "\246", "&brvbar;",
+ "\247", "&sect;", "\250", "&uml;", "\251", "&copy;",
+ "\252", "&ordf;", "\253", "&laquo;", "\254", "&not;",
+ "\255", "&shy;", "\256", "&reg;", "\257", "&hibar;",
+ "\260", "&deg;", "\261", "&plusmn;", "\262", "&sup2;",
+ "\263", "&sup3;", "\264", "&acute;", "\265", "&micro;",
+ "\266", "&para;", "\270", "&cedil;", "\271", "&sup1;",
+ "\272", "&ordm;", "\273", "&raquo;", "\274", "&frac14;",
+ "\275", "&frac12;", "\276", "&frac34;", "\277", "&iquest;",
+ "\300", "&Agrave;", "\301", "&Aacute;", "\302", "&Acirc;",
+ "\303", "&Atilde;", "\304", "&Auml;", "\305", "&Aring;",
+ "\306", "&AElig;", "\307", "&Ccedil;", "\310", "&Egrave;",
+ "\311", "&Eacute;", "\312", "&Ecirc;", "\313", "&Euml;",
+ "\314", "&Igrave;", "\315", "&Iacute;", "\316", "&Icirc;",
+ "\317", "&Iuml;", "\320", "&ETH;", "\321", "&Ntilde;",
+ "\322", "&Ograve;", "\323", "&Oacute;", "\324", "&Ocirc;",
+ "\325", "&Otilde;", "\326", "&Ouml;", "\327", "&times;",
+ "\330", "&Oslash;", "\331", "&Ugrave;", "\332", "&Uacute;",
+ "\333", "&Ucirc;", "\334", "&Uuml;", "\335", "&Yacute;",
+ "\336", "&THORN;", "\337", "&szlig;", "\340", "&agrave;",
+ "\341", "&aacute;", "\342", "&acirc;", "\343", "&atilde;",
+ "\344", "&auml;", "\345", "&aring;", "\346", "&aelig;",
+ "\347", "&ccedil;", "\350", "&egrave;", "\351", "&eacute;",
+ "\352", "&ecirc;", "\353", "&euml;", "\354", "&igrave;",
+ "\355", "&iacute;", "\356", "&icirc;", "\357", "&iuml;",
+ "\360", "&eth;", "\361", "&ntilde;", "\362", "&ograve;",
+ "\363", "&oacute;", "\364", "&ocirc;", "\365", "&otilde;",
+ "\366", "&ouml;", "\367", "&divide;", "\370", "&oslash;",
+ "\371", "&ugrave;", "\372", "&uacute;", "\373", "&ucirc;",
+ "\374", "&uuml;", "\375", "&yacute;", "\376", "&thorn;",
+ "\377", "&yuml;", "\267", "&middot;",
+);
+
+# alignments for tables
+our @alignments = ('', '', ' ALIGN="RIGHT"', ' ALIGN="CENTER"');
+our @lc_alignments = ('', '', ' align="right"', ' align="center"');
+our @xhtml_alignments =
+ ('', '', ' style="text-align: right;"', ' style="text-align: center;"');
+
+#---------------------------------------------------------------#
+# Object interface
+#---------------------------------------------------------------#
+
+=head2 new
+
+ $conv = new HTML::TextToHTML()
+
+ $conv = new HTML::TextToHTML(titlefirst=>1,
+ ...
+ );
+
+Create a new object with new. If one argument is given, it is assumed
+to be a reference to an array of arguments. If more than one argument
+is given, it is assumed to be a hash of arguments. These arguments will
+be used in invocations of other methods.
+
+See L</OPTIONS> for the possible values of the arguments.
+
+=cut
+
+sub new
+{
+ my $invocant = shift;
+ my $self = {};
+
+ my $class = ref($invocant) || $invocant; # Object or class name
+ init_our_data($self);
+
+ # bless self
+ bless($self, $class);
+
+ $self->args(@_);
+
+ return $self;
+} # new
+
+=head2 args
+
+ $conv->args(short_line_length=>60,
+ titlefirst=>1,
+ ....
+ );
+
+Updates the current arguments/options of the HTML::TextToHTML object.
+Takes either a hash, or a reference to an array of arguments, which will
+be used in invocations of other methods.
+See L</OPTIONS> for the possible values of the arguments.
+
+NOTE: the reference-to-an-array usage is DEPRECATED and will be removed
+in the future.
+
+=cut
+
+sub args
+{
+ my $self = shift;
+ my %args = ();
+ my @arg_array = ();
+ if ( @_
+ && @_ == 1
+ && ref $_[0] eq 'ARRAY')
+ {
+ # this is a reference to an array -- use the old style args
+ my $aref = shift;
+ @arg_array = @{$aref};
+ }
+ elsif (@_)
+ {
+ %args = @_;
+ }
+
+ if (%args)
+ {
+ if ($self->{debug})
+ {
+ print STDERR "========args(hash)========\n";
+ print STDERR Dumper(%args);
+ }
+ foreach my $arg (keys %args)
+ {
+ if (defined $args{$arg})
+ {
+ if ($arg =~ /^-/)
+ {
+ $arg =~ s/^-//; # get rid of first dash
+ $arg =~ s/^-//; # get rid of possible second dash
+ }
+ if ($self->{debug})
+ {
+ print STDERR "--", $arg;
+ }
+ $self->{$arg} = $args{$arg};
+ if ($self->{debug})
+ {
+ print STDERR " ", $args{$arg}, "\n";
+ }
+ }
+ }
+ }
+ elsif (@arg_array)
+ {
+ if ($self->{debug})
+ {
+ print STDERR "========args(array)========\n";
+ print STDERR Dumper(@arg_array);
+ }
+ # the arg array may have filenames at the end of it,
+ # so don't consume them
+ my $look_at_args = 1;
+ while (@arg_array && $look_at_args)
+ {
+ my $arg = shift @arg_array;
+ # check for arguments which are bools,
+ # and thus have no companion value
+ if ($arg =~ /^-/)
+ {
+ $arg =~ s/^-//; # get rid of first dash
+ $arg =~ s/^-//; # get rid of possible second dash
+ if ($self->{debug})
+ {
+ print STDERR "--", $arg;
+ }
+ if ( $arg eq 'debug'
+ || $arg eq 'demoronize'
+ || $arg eq 'eight_bit_clean'
+ || $arg eq 'escape_HTML_chars'
+ || $arg eq 'explicit_headings'
+ || $arg eq 'extract'
+ || $arg eq 'link_only'
+ || $arg eq 'lower_case_tags'
+ || $arg eq 'mailmode'
+ || $arg eq 'make_anchors'
+ || $arg eq 'make_links'
+ || $arg eq 'make_tables'
+ || $arg eq 'preserve_indent'
+ || $arg eq 'titlefirst'
+ || $arg eq 'unhyphenation'
+ || $arg eq 'use_mosaic_header'
+ || $arg eq 'use_preformat_marker'
+ || $arg eq 'verbose'
+ || $arg eq 'xhtml')
+ {
+ $self->{$arg} = 1;
+ if ($self->{debug})
+ {
+ print STDERR "=true\n";
+ }
+ }
+ elsif ($arg eq 'nodebug'
+ || $arg eq 'nodemoronize'
+ || $arg eq 'noeight_bit_clean'
+ || $arg eq 'noescape_HTML_chars'
+ || $arg eq 'noexplicit_headings'
+ || $arg eq 'noextract'
+ || $arg eq 'nolink_only'
+ || $arg eq 'nolower_case_tags'
+ || $arg eq 'nomailmode'
+ || $arg eq 'nomake_anchors'
+ || $arg eq 'nomake_links'
+ || $arg eq 'nomake_tables'
+ || $arg eq 'nopreserve_indent'
+ || $arg eq 'notitlefirst'
+ || $arg eq 'nounhyphenation'
+ || $arg eq 'nouse_mosaic_header'
+ || $arg eq 'nouse_preformat_marker'
+ || $arg eq 'noverbose'
+ || $arg eq 'noxhtml')
+ {
+ $arg =~ s/^no//;
+ $self->{$arg} = 0;
+ if ($self->{debug})
+ {
+ print STDERR " $arg=false\n";
+ }
+ }
+ else
+ {
+ my $val = shift @arg_array;
+ if ($self->{debug})
+ {
+ print STDERR "=", $val, "\n";
+ }
+ # check the types
+ if (defined $arg && defined $val)
+ {
+ if ( $arg eq 'infile'
+ || $arg eq 'custom_heading_regexp'
+ || $arg eq 'links_dictionaries')
+ { # arrays
+ if ($val eq 'CLEAR')
+ {
+ $self->{$arg} = [];
+ }
+ else
+ {
+ push @{$self->{$arg}}, $val;
+ }
+ }
+ elsif ($arg eq 'file')
+ { # alternate for 'infile'
+ if ($val eq 'CLEAR')
+ {
+ $self->{infile} = [];
+ }
+ else
+ {
+ push @{$self->{infile}}, $val;
+ }
+ }
+ elsif ($arg eq 'table_type')
+ {
+ # hash
+ if ($val eq 'CLEAR')
+ {
+ $self->{$arg} = {};
+ }
+ else
+ {
+ my ($f1, $v1) = split(/=/, $val, 2);
+ $self->{$arg}->{$f1} = $v1;
+ }
+ }
+ else
+ {
+ $self->{$arg} = $val;
+ }
+ }
+ }
+ }
+ else
+ {
+ # if an option don't start with - then we've
+ # come to the end of the options
+ $look_at_args = 0;
+ }
+ }
+ }
+ if ($self->{debug})
+ {
+ print STDERR Dumper($self);
+ }
+
+ return 1;
+} # args
+
+=head2 process_chunk
+
+$newstring = $conv->process_chunk($mystring);
+
+Convert a string to a HTML fragment. This assumes that this string is
+at the least, a single paragraph, but it can contain more than that.
+This returns the processed string. If you want to pass arguments to
+alter the behaviour of this conversion, you need to do that earlier,
+either when you create the object, or with the L</args> method.
+
+ $newstring = $conv->process_chunk($mystring,
+ close_tags=>0);
+
+If there are open tags (such as lists) in the input string,
+process_chunk will now automatically close them, unless you specify not
+to, with the close_tags option.
+
+ $newstring = $conv->process_chunk($mystring,
+ is_fragment=>1);
+
+If you want this string to be treated as a fragment, and not assumed to
+be a paragraph, set is_fragment to true. If there is more than one
+paragraph in the string (ie it contains blank lines) then this option
+will be ignored.
+
+=cut
+
+sub process_chunk ($$;%)
+{
+ my $self = shift;
+ my $chunk = shift;
+ my %args = (
+ close_tags => 1,
+ is_fragment => 0,
+ @_
+ );
+
+ my $ret_str = '';
+ my @paras = split(/\r?\n\r?\n/, $chunk);
+ my $ind = 0;
+ if (@paras == 1) # just one paragraph
+ {
+ $ret_str .= $self->process_para(
+ $chunk,
+ close_tags => $args{close_tags},
+ is_fragment => $args{is_fragment}
+ );
+ }
+ else
+ {
+ my $ind = 0;
+ foreach my $para (@paras)
+ {
+ # if the paragraph doesn't end with a newline, add one
+ $para .= "\n" if ($para !~ /\n$/);
+ if ($ind == @paras - 1) # last one
+ {
+ $ret_str .= $self->process_para(
+ $para,
+ close_tags => $args{close_tags},
+ is_fragment => 0
+ );
+ }
+ else
+ {
+ $ret_str .= $self->process_para(
+ $para,
+ close_tags => 0,
+ is_fragment => 0
+ );
+ }
+ $ind++;
+ }
+ }
+ $ret_str;
+} # process_chunk
+
+=head2 process_para
+
+$newstring = $conv->process_para($mystring);
+
+Convert a string to a HTML fragment. This assumes that this string is
+at the most a single paragraph, with no blank lines in it. If you don't
+know whether your string will contain blank lines or not, use the
+L</process_chunk> method instead.
+
+This returns the processed string. If you want to pass arguments to
+alter the behaviour of this conversion, you need to do that earlier,
+either when you create the object, or with the L</args> method.
+
+ $newstring = $conv->process_para($mystring,
+ close_tags=>0);
+
+If there are open tags (such as lists) in the input string, process_para
+will now automatically close them, unless you specify not to, with the
+close_tags option.
+
+ $newstring = $conv->process_para($mystring,
+ is_fragment=>1);
+
+If you want this string to be treated as a fragment, and not assumed to be
+a paragraph, set is_fragment to true.
+
+=cut
+
+sub process_para ($$;%)
+{
+ my $self = shift;
+ my $para = shift;
+ my %args = (
+ close_tags => 1,
+ is_fragment => 0,
+ @_
+ );
+
+ # if this is an external call, do certain initializations
+ $self->do_init_call();
+
+ my $para_action = $NONE;
+
+ # tables and mailheaders don't carry over from one para to the next
+ if ($self->{__mode} & $TABLE)
+ {
+ $self->{__mode} ^= $TABLE;
+ }
+ if ($self->{__mode} & $MAILHEADER)
+ {
+ $self->{__mode} ^= $MAILHEADER;
+ }
+
+ # convert Microsoft character codes into sensible characters
+ if ($self->{demoronize})
+ {
+ demoronize_char($para);
+ }
+
+ # if we are not just linking, we are discerning structure
+ if (!$self->{link_only})
+ {
+
+ # Chop trailing whitespace and DOS CRs
+ $para =~ s/[ \011]*\015$//;
+ # Chop leading whitespace and DOS CRs
+ $para =~ s/^[ \011]*\015//;
+ $para =~ s/\r//g; # remove any stray carriage returns
+
+ my @done_lines = (); # lines which have been processed
+
+ # The PRE_EXPLICIT structure can carry over from one
+ # paragraph to the next, but it is ended with the
+ # explicit end-tag designated for it.
+ # Therefore we can shortcut for this by checking
+ # for the end of the PRE_EXPLICIT and chomping off
+ # the preformatted string part of this para before
+ # we have to split it into lines.
+ # Note that after this check, we could *still* be
+ # in PRE_EXPLICIT mode.
+ if ($self->{__mode} & $PRE_EXPLICIT)
+ {
+ my $pre_str =
+ $self->split_end_explicit_preformat(para_ref => \$para);
+ if ($pre_str)
+ {
+ push @done_lines, $pre_str;
+ }
+ }
+
+ if (defined $para && $para ne "")
+ {
+ #
+ # Now we split the paragraph into lines
+ #
+ my $para_len = length($para);
+ my @para_lines = split(/^/, $para);
+ my @para_line_len = ();
+ my @para_line_indent = ();
+ my @para_line_action = ();
+ my $i = 0;
+ foreach my $line (@para_lines)
+ {
+ # Change all tabs to spaces
+ while ($line =~ /\011/)
+ {
+ my $tw = $self->{tab_width};
+ $line =~ s/\011/" " x ($tw - (length($`) % $tw))/e;
+ }
+ push @para_line_len, length($line);
+ if ($line =~ /^\s*$/)
+ {
+ # if the line is blank, use the previous indent
+ # if there is one
+ push @para_line_indent,
+ ($i == 0 ? 0 : $para_line_indent[$i - 1]);
+ }
+ else
+ {
+ # count the number of leading spaces
+ my ($ws) = $line =~ /^( *)[^ ]/;
+ push @para_line_indent, length($ws);
+ }
+ push @para_line_action, $NONE;
+ $i++;
+ }
+
+ # There are two more structures which carry over from one
+ # paragraph to the next: LIST, PRE
+ # There are also certain things which will immediately end
+ # multi-paragraph LIST and PRE, if found at the start
+ # of a paragraph:
+ # A list will be ended by
+ # TABLE, MAILHEADER, HEADER, custom-header
+ # A PRE will be ended by
+ # TABLE, MAILHEADER and non-pre text
+
+ my $is_table = 0;
+ my $table_type = 0;
+ my $is_mailheader = 0;
+ my $is_header = 0;
+ my $is_custom_header = 0;
+ if (@{$self->{custom_heading_regexp}})
+ {
+ $is_custom_header =
+ $self->is_custom_heading(line => $para_lines[0]);
+ }
+ if ( $self->{make_tables}
+ && @para_lines > 1)
+ {
+ $table_type = $self->get_table_type(
+ rows_ref => \@para_lines,
+ para_len => $para_len
+ );
+ $is_table = ($table_type != 0);
+ }
+ if ( !$self->{explicit_headings}
+ && @para_lines > 1
+ && !$is_table)
+ {
+ $is_header = $self->is_heading(
+ line_ref => \$para_lines[0],
+ next_ref => \$para_lines[1]
+ );
+ }
+ # Note that it is concievable that someone has
+ # partially disabled mailmode by making a custom header
+ # which matches the start of mail.
+ # This is stupid, but allowable, so we check.
+ if ( $self->{mailmode}
+ && !$is_table
+ && !$is_custom_header)
+ {
+ $is_mailheader = $self->is_mailheader(rows_ref => \@para_lines);
+ }
+
+ # end the list if we can end it
+ if (
+ ($self->{__mode} & $LIST)
+ && ( $is_table
+ || $is_mailheader
+ || $is_header
+ || $is_custom_header)
+ )
+ {
+ my $list_end = '';
+ my $action = 0;
+ $self->endlist(
+ num_lists => $self->{__listnum},
+ prev_ref => \$list_end,
+ line_action_ref => \$action
+ );
+ push @done_lines, $list_end;
+ $self->{__prev_para_action} |= $END;
+ }
+
+ # end the PRE if we can end it
+ if (
+ ($self->{__mode} & $PRE)
+ && !($self->{__mode} & $PRE_EXPLICIT)
+ && ( $is_table
+ || $is_mailheader
+ || !$self->is_preformatted($para_lines[0]))
+ && ($self->{preformat_trigger_lines} != 0)
+ )
+ {
+ my $pre_end = '';
+ my $tag = $self->close_tag('PRE');
+ $pre_end = "${tag}\n";
+ $self->{__mode} ^= ($PRE & $self->{__mode});
+ push @done_lines, $pre_end;
+ $self->{__prev_para_action} |= $END;
+ }
+
+ # The PRE and PRE_EXPLICIT structure can carry over
+ # from one paragraph to the next, but because we don't
+ # want trailing newlines, such newlines would have been
+ # gotten rid of in the previous call. However, with
+ # a preformatted text, we do want the blank lines in it
+ # to be preserved, so let's add a blank line in here.
+ if ($self->{__mode} & $PRE)
+ {
+ push @done_lines, "\n";
+ }
+
+ # Now, we do certain things which are only found at the
+ # start of a paragraph:
+ # HEADER, custom-header, TABLE and MAILHEADER
+ # These could concievably eat the rest of the paragraph.
+
+ if ($is_custom_header)
+ {
+ # custom header eats the first line
+ my $header = shift @para_lines;
+ shift @para_line_len;
+ shift @para_line_indent;
+ shift @para_line_action;
+ $self->custom_heading(line_ref => \$header);
+ push @done_lines, $header;
+ $self->{__prev_para_action} |= $HEADER;
+ }
+ elsif ($is_header)
+ {
+ # normal header eats the first two lines
+ my $header = shift @para_lines;
+ shift @para_line_len;
+ shift @para_line_indent;
+ shift @para_line_action;
+ my $underline = shift @para_lines;
+ shift @para_line_len;
+ shift @para_line_indent;
+ shift @para_line_action;
+ $self->heading(
+ line_ref => \$header,
+ next_ref => \$underline
+ );
+ push @done_lines, $header;
+ $self->{__prev_para_action} |= $HEADER;
+ }
+
+ # do the table stuff on the array of lines
+ if ($self->{make_tables} && $is_table)
+ {
+ if (
+ $self->tablestuff(
+ table_type => $table_type,
+ rows_ref => \@para_lines,
+ para_len => $para_len
+ )
+ )
+ {
+ # this has used up all the lines
+ push @done_lines, @para_lines;
+ @para_lines = ();
+ }
+ }
+
+ # check of this para is a mail-header
+ if ( $is_mailheader
+ && !($self->{__mode} & $TABLE)
+ && @para_lines)
+ {
+ $self->mailheader(rows_ref => \@para_lines);
+ # this has used up all the lines
+ push @done_lines, @para_lines;
+ @para_lines = ();
+ }
+
+ #
+ # Now go through the paragraph lines one at a time
+ # Note that we won't have TABLE, MAILHEADER, HEADER modes
+ # because they would have eaten the lines
+ #
+ my $prev = '';
+ my $prev_action = $self->{__prev_para_action};
+ for (my $i = 0; $i < @para_lines; $i++)
+ {
+ my $prev_ref;
+ my $prev_action_ref;
+ my $prev_line_indent;
+ my $prev_line_len;
+ if ($i == 0)
+ {
+ $prev_ref = \$prev;
+ $prev_action_ref = \$prev_action;
+ $prev_line_indent = 0;
+ $prev_line_len = 0;
+ }
+ else
+ {
+ $prev_ref = \$para_lines[$i - 1];
+ $prev_action_ref = \$para_line_action[$i - 1];
+ $prev_line_indent = $para_line_indent[$i - 1];
+ $prev_line_len = $para_line_len[$i - 1];
+ }
+ my $next_ref;
+ if ($i == $#para_lines)
+ {
+ $next_ref = undef;
+ }
+ else
+ {
+ $next_ref = \$para_lines[$i + 1];
+ }
+
+ $para_lines[$i] = escape($para_lines[$i])
+ if ($self->{escape_HTML_chars});
+
+ if ($self->{mailmode}
+ && !($self->{__mode} & ($PRE_EXPLICIT)))
+ {
+ $self->mailquote(
+ line_ref => \$para_lines[$i],
+ line_action_ref => \$para_line_action[$i],
+ prev_ref => $prev_ref,
+ prev_action_ref => $prev_action_ref,
+ next_ref => $next_ref
+ );
+ }
+
+ if ( ($self->{__mode} & $PRE)
+ && ($self->{preformat_trigger_lines} != 0))
+ {
+ $self->endpreformat(
+ para_lines_ref => \@para_lines,
+ para_action_ref => \@para_line_action,
+ ind => $i,
+ prev_ref => $prev_ref
+ );
+ }
+
+ if (!($self->{__mode} & $PRE))
+ {
+ $self->hrule(
+ para_lines_ref => \@para_lines,
+ para_action_ref => \@para_line_action,
+ ind => $i
+ );
+ }
+ if (!($self->{__mode} & ($PRE))
+ && ($para_lines[$i] !~ /^\s*$/))
+ {
+ $self->liststuff(
+ para_lines_ref => \@para_lines,
+ para_action_ref => \@para_line_action,
+ para_line_indent_ref => \@para_line_indent,
+ ind => $i,
+ prev_ref => $prev_ref
+ );
+ }
+ if ( !($para_line_action[$i] & ($HEADER | $LIST))
+ && !($self->{__mode} & ($LIST | $PRE))
+ && $self->{__preformat_enabled})
+ {
+ $self->preformat(
+ mode_ref => \$self->{__mode},
+ line_ref => \$para_lines[$i],
+ line_action_ref => \$para_line_action[$i],
+ prev_ref => $prev_ref,
+ next_ref => $next_ref,
+ prev_action_ref => $prev_action_ref
+ );
+ }
+ if (!($self->{__mode} & ($PRE)))
+ {
+ $self->paragraph(
+ line_ref => \$para_lines[$i],
+ line_action_ref => \$para_line_action[$i],
+ prev_ref => $prev_ref,
+ prev_action_ref => $prev_action_ref,
+ line_indent => $para_line_indent[$i],
+ prev_indent => $prev_line_indent,
+ is_fragment => $args{is_fragment},
+ ind => $i,
+ );
+ }
+ if (!($self->{__mode} & ($PRE | $LIST)))
+ {
+ $self->shortline(
+ line_ref => \$para_lines[$i],
+ line_action_ref => \$para_line_action[$i],
+ prev_ref => $prev_ref,
+ prev_action_ref => $prev_action_ref,
+ prev_line_len => $prev_line_len
+ );
+ }
+ if (!($self->{__mode} & ($PRE)))
+ {
+ $self->caps(
+ line_ref => \$para_lines[$i],
+ line_action_ref => \$para_line_action[$i]
+ );
+ }
+
+ # put the "prev" line in front of the first line
+ $para_lines[$i] = $prev . $para_lines[$i]
+ if ($i == 0 && ($prev !~ /^\s*$/));
+ }
+
+ # para action is the action of the last line of the para
+ $para_action = $para_line_action[$#para_line_action];
+ $para_action = $NONE if (!defined $para_action);
+
+ # push them on the done lines
+ push @done_lines, @para_lines;
+ @para_lines = ();
+
+ }
+ # now put the para back together as one string
+ $para = join('', @done_lines);
+
+ # if this is a paragraph, and we are in XHTML mode,
+ # close an open paragraph.
+ if ($self->{xhtml})
+ {
+ my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
+ if (defined $open_tag && $open_tag eq 'P')
+ {
+ $para .= $self->close_tag('P');
+ }
+ }
+
+ if (
+ $self->{unhyphenation}
+
+ # ends in hyphen & next line starts w/letters
+ && ($para =~ /[^\W\d_]\-\n\s*[^\W\d_]/s) && !(
+ $self->{__mode} &
+ ($PRE | $HEADER | $MAILHEADER | $TABLE | $BREAK)
+ )
+ )
+ {
+ $self->unhyphenate_para(\$para);
+ }
+ # chop trailing newlines for continuing lists and PRE
+ if ( $self->{__mode} & $LIST
+ || $self->{__mode} & $PRE)
+ {
+ $para =~ s/\n$//g;
+ }
+ }
+
+ # apply links and bold/italic formatting
+ if ($para !~ /^\s*$/)
+ {
+ $self->apply_links(
+ para_ref => \$para,
+ para_action_ref => \$para_action
+ );
+ }
+
+ # close any open lists if required to
+ if ( $args{close_tags}
+ && $self->{__mode} & $LIST) # End all lists
+ {
+ $self->endlist(
+ num_lists => $self->{__listnum},
+ prev_ref => \$para,
+ line_action_ref => \$para_action
+ );
+ }
+ # close any open tags
+ if ($args{close_tags} && $self->{xhtml})
+ {
+ while (@{$self->{__tags}})
+ {
+ $para .= $self->close_tag('');
+ }
+ }
+
+ # convert remaining Microsoft character codes into sensible HTML
+ if ($self->{demoronize})
+ {
+ $para = demoronize_code($para);
+ }
+ # All the matching and formatting is done. Now we can
+ # replace non-ASCII characters with character entities.
+ if (!$self->{eight_bit_clean})
+ {
+ my @chars = split(//, $para);
+ foreach $_ (@chars)
+ {
+ $_ = $char_entities{$_} if defined($char_entities{$_});
+ }
+ $para = join('', @chars);
+ }
+
+ $self->{__prev_para_action} = $para_action;
+
+ return $para;
+} # process_para
+
+=head2 txt2html
+
+ $conv->txt2html(%args);
+
+Convert a text file to HTML. Takes a hash of arguments, or a reference
+to an array of arguments to customize the conversion; (this includes
+saying what file to convert!) See L</OPTIONS> for the possible values of
+the arguments. Arguments which have already been set with B<new> or
+B<args> will remain as they are, unless they are overridden.
+
+=cut
+
+sub txt2html ($;$)
+{
+ my $self = shift;
+
+ if (@_)
+ {
+ $self->args(@_);
+ }
+
+ $self->do_init_call();
+
+ my $outhandle;
+ my $outhandle_needs_closing;
+
+ # set up the output
+ if ($self->{outhandle})
+ {
+ $outhandle = $self->{outhandle};
+ $outhandle_needs_closing = 1;
+ }
+ elsif ($self->{outfile} eq "-")
+ {
+ $outhandle = *STDOUT;
+ $outhandle_needs_closing = 0;
+ }
+ else
+ {
+ open($outhandle, "> " . $self->{outfile})
+ || die "Error: unable to open ", $self->{outfile}, ": $!\n";
+ $outhandle_needs_closing = 1;
+ }
+
+ # slurp up a paragraph at a time, a file at a time
+ local $/ = "";
+ my $para = '';
+ my $count = 0;
+ my $print_count = 0;
+ my @sources = ();
+ my $source_type;
+ if ($self->{infile} and @{$self->{infile}})
+ {
+ @sources = @{$self->{infile}};
+ $source_type = 'file';
+ }
+ elsif ($self->{inhandle} and @{$self->{inhandle}})
+ {
+ @sources = @{$self->{inhandle}};
+ $source_type = 'filehandle';
+ }
+ elsif ($self->{instring} and @{$self->{instring}})
+ {
+ @sources = @{$self->{instring}};
+ $source_type = 'string';
+ }
+ my $inhandle;
+ my $inhandle_needs_closing = 0;
+ foreach my $source (@sources)
+ {
+ $inhandle = undef;
+ if ($source_type eq 'file')
+ {
+ if (!$source or $source eq '-')
+ {
+ $inhandle = *STDIN;
+ $inhandle_needs_closing = 0;
+ }
+ else
+ {
+ if (-f $source && open($inhandle, $source))
+ {
+ $inhandle_needs_closing = 1;
+ }
+ else # error
+ {
+ warn "Could not open $source\n";
+ next;
+ }
+ }
+ }
+ elsif ($source_type eq 'filehandle')
+ {
+ $inhandle = $source;
+ $inhandle_needs_closing = 1;
+ }
+ if ($source_type eq 'string')
+ {
+ # process the string
+ $para = $_;
+ $para =~ s/\n$//; # trim the endline
+ if ($count == 0)
+ {
+ $self->do_file_start($outhandle, $para);
+ }
+ $self->{__done_with_sect_link} = [];
+ $para = $self->process_chunk($para, close_tags => 0);
+ print $outhandle $para, "\n";
+ $print_count++;
+ $count++;
+ }
+ else # file or filehandle
+ {
+ while (<$inhandle>)
+ {
+ $para = $_;
+ $para =~ s/\n$//; # trim the endline
+ if ($count == 0)
+ {
+ $self->do_file_start($outhandle, $para);
+ }
+ $self->{__done_with_sect_link} = [];
+ $para = $self->process_chunk($para, close_tags => 0);
+ print $outhandle $para, "\n";
+ $print_count++;
+ $count++;
+ }
+ if ($inhandle_needs_closing)
+ {
+ close($inhandle);
+ }
+ }
+ } # for each file
+
+ $self->{__prev} = "";
+ if ($self->{__mode} & $LIST) # End all lists
+ {
+ $self->endlist(
+ num_lists => $self->{__listnum},
+ prev_ref => \$self->{__prev},
+ line_action_ref => \$self->{__line_action}
+ );
+ }
+ print $outhandle $self->{__prev};
+
+ # end open preformats
+ if ($self->{__mode} & $PRE)
+ {
+ my $tag = $self->close_tag('PRE');
+ print $outhandle $tag;
+ }
+
+ # close all open tags
+ if ( $self->{xhtml}
+ && !$self->{extract}
+ && @{$self->{__tags}})
+ {
+ if ($self->{dict_debug} & 8)
+ {
+ print STDERR "closing all tags at end\n";
+ }
+ # close any open tags (until we get to the body)
+ my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
+ while (@{$self->{__tags}}
+ && $open_tag ne 'BODY'
+ && $open_tag ne 'HTML')
+ {
+ print $outhandle $self->close_tag('');
+ $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
+ }
+ print $outhandle "\n";
+ }
+
+ if ($self->{append_file})
+ {
+ if (-r $self->{append_file})
+ {
+ open(APPEND, $self->{append_file});
+ while (<APPEND>)
+ {
+ print $outhandle $_;
+ $print_count++;
+ }
+ close(APPEND);
+ }
+ else
+ {
+ print STDERR "Can't find or read file ", $self->{append_file},
+ " to append.\n";
+ }
+ }
+
+ # print the closing tags (if we have printed stuff at all)
+ if ($print_count && !$self->{extract})
+ {
+ print $outhandle $self->close_tag('BODY'), "\n";
+ print $outhandle $self->close_tag('HTML'), "\n";
+ }
+ if ($outhandle_needs_closing)
+ {
+ close($outhandle);
+ }
+ return 1;
+}
+
+#---------------------------------------------------------------#
+# Init-related subroutines
+
+#--------------------------------#
+# Name: init_our_data
+# Args:
+# $self
+sub init_our_data ($)
+{
+ my $self = shift;
+
+ $self->{debug} = 0;
+
+ #
+ # All the options, in alphabetical order
+ #
+ $self->{append_file} = '';
+ $self->{append_head} = '';
+ $self->{body_deco} = '';
+ $self->{bullets} = '-=o*\267';
+ $self->{bullets_ordered} = '';
+ $self->{bold_delimiter} = '#';
+ $self->{caps_tag} = 'STRONG';
+ $self->{custom_heading_regexp} = [];
+ $self->{default_link_dict} =
+ ($ENV{HOME} ? "$ENV{HOME}/.txt2html.dict" : '.txt2html.dict');
+ $self->{dict_debug} = 0;
+ $self->{doctype} = "-//W3C//DTD HTML 3.2 Final//EN";
+ $self->{demoronize} = 1;
+ $self->{eight_bit_clean} = 0;
+ $self->{escape_HTML_chars} = 1;
+ $self->{explicit_headings} = 0;
+ $self->{extract} = 0;
+ $self->{hrule_min} = 4;
+ $self->{indent_width} = 2;
+ $self->{indent_par_break} = 0;
+ $self->{infile} = [];
+ $self->{inhandle} = [];
+ $self->{instring} = [];
+ $self->{italic_delimiter} = '*';
+ $self->{links_dictionaries} = [];
+ $self->{link_only} = 0;
+ $self->{lower_case_tags} = 0;
+ $self->{mailmode} = 0;
+ $self->{make_anchors} = 1;
+ $self->{make_links} = 1;
+ $self->{make_tables} = 0;
+ $self->{min_caps_length} = 3;
+ $self->{outfile} = '-';
+ $self->{par_indent} = 2;
+ $self->{preformat_trigger_lines} = 2;
+ $self->{endpreformat_trigger_lines} = 2;
+ $self->{preformat_start_marker} = "^(:?(:?&lt;)|<)PRE(:?(:?&gt;)|>)\$";
+ $self->{preformat_end_marker} = "^(:?(:?&lt;)|<)/PRE(:?(:?&gt;)|>)\$";
+ $self->{preformat_whitespace_min} = 5;
+ $self->{prepend_file} = '';
+ $self->{preserve_indent} = 0;
+ $self->{short_line_length} = 40;
+ $self->{style_url} = '';
+ $self->{tab_width} = 8;
+ $self->{table_type} = {
+ ALIGN => 1,
+ PGSQL => 1,
+ BORDER => 1,
+ DELIM => 1,
+ };
+ $self->{title} = '';
+ $self->{titlefirst} = 0;
+ $self->{underline_length_tolerance} = 1;
+ $self->{underline_offset_tolerance} = 1;
+ $self->{unhyphenation} = 1;
+ $self->{use_mosaic_header} = 0;
+ $self->{use_preformat_marker} = 0;
+ $self->{xhtml} = 0;
+
+ # accumulation variables
+ $self->{__file} = ""; # Current file being processed
+ $self->{__heading_styles} = {};
+ $self->{__num_heading_styles} = 0;
+ $self->{__links_table} = {};
+ $self->{__links_table_order} = [];
+ $self->{__search_patterns} = [];
+ $self->{__repl_code} = [];
+ $self->{__prev_para_action} = 0;
+ $self->{__non_header_anchor} = 0;
+ $self->{__mode} = 0;
+ $self->{__listnum} = 0;
+ $self->{__list_nice_indent} = "";
+ $self->{__list_indent} = [];
+
+ $self->{__call_init_done} = 0;
+
+ #
+ # The global links data
+ #
+ # This is stored in the DATA handle, after the __DATA__ at
+ # the end of this file; but because the test scripts (and possibly
+ # other scripts) don't just create one instance of this object,
+ # we have to remember the position of the DATA handle
+ # and reset it after we've read from it, just in case
+ # we have to read from it again.
+ # This also means that we don't close it, either. Hope that doesn't
+ # cause a problem...
+ #
+ my $curpos = tell(DATA); # remember the __DATA__ position
+ my @lines = ();
+ while (<DATA>)
+ {
+ # skip lines that start with '#'
+ next if /^\#/;
+ # skip lines that end with unescaped ':'
+ next if /^.*[^\\]:\s*$/;
+ push @lines, $_;
+ }
+ # reset the data handle to the start, just in case
+ seek(DATA, $curpos, 0);
+ $self->{__global_links_data} = join('', @lines);
+} # init_our_data
+
+#---------------------------------------------------------------#
+# txt2html-related subroutines
+
+#--------------------------------#
+# Name: deal_with_options
+# do extra processing related to particular options
+# Args:
+# $self
+sub deal_with_options ($)
+{
+ my $self = shift;
+
+ if ($self->{links_dictionaries})
+ {
+ # only put into the links dictionaries files which are readable
+ my @dict_files = @{$self->{links_dictionaries}};
+ $self->args(links_dictionaries => []);
+
+ foreach my $ld (@dict_files)
+ {
+ if (-r $ld)
+ {
+ $self->{'make_links'} = 1;
+ $self->args(['--links_dictionaries', $ld]);
+ }
+ else
+ {
+ print STDERR "Can't find or read link-file $ld\n";
+ }
+ }
+ }
+ if (!$self->{make_links})
+ {
+ $self->{'links_dictionaries'} = 0;
+ }
+ if ($self->{append_file})
+ {
+ if (!-r $self->{append_file})
+ {
+ print STDERR "Can't find or read ", $self->{append_file}, "\n";
+ $self->{append_file} = '';
+ }
+ }
+ if ($self->{prepend_file})
+ {
+ if (!-r $self->{prepend_file})
+ {
+ print STDERR "Can't find or read ", $self->{prepend_file}, "\n";
+ $self->{'prepend_file'} = '';
+ }
+ }
+ if ($self->{append_head})
+ {
+ if (!-r $self->{append_head})
+ {
+ print STDERR "Can't find or read ", $self->{append_head}, "\n";
+ $self->{'append_head'} = '';
+ }
+ }
+
+ if (!$self->{outfile})
+ {
+ $self->{'outfile'} = "-";
+ }
+
+ $self->{'preformat_trigger_lines'} = 0
+ if ($self->{preformat_trigger_lines} < 0);
+ $self->{'preformat_trigger_lines'} = 2
+ if ($self->{preformat_trigger_lines} > 2);
+
+ $self->{'endpreformat_trigger_lines'} = 1
+ if ($self->{preformat_trigger_lines} == 0);
+ $self->{'endpreformat_trigger_lines'} = 0
+ if ($self->{endpreformat_trigger_lines} < 0);
+ $self->{'endpreformat_trigger_lines'} = 2
+ if ($self->{endpreformat_trigger_lines} > 2);
+
+ $self->{__preformat_enabled} =
+ (($self->{endpreformat_trigger_lines} != 0)
+ || $self->{use_preformat_marker});
+
+ if ($self->{use_mosaic_header})
+ {
+ my $num_heading_styles = 0;
+ my %heading_styles = ();
+ $heading_styles{"*"} = ++$num_heading_styles;
+ $heading_styles{"="} = ++$num_heading_styles;
+ $heading_styles{"+"} = ++$num_heading_styles;
+ $heading_styles{"-"} = ++$num_heading_styles;
+ $heading_styles{"~"} = ++$num_heading_styles;
+ $heading_styles{"."} = ++$num_heading_styles;
+ $self->{__heading_styles} = \%heading_styles;
+ $self->{__num_heading_styles} = $num_heading_styles;
+ }
+ # XHTML implies lower case
+ $self->{'lower_case_tags'} = 1 if ($self->{xhtml});
+}
+
+sub escape ($)
+{
+ my ($text) = @_;
+ $text =~ s/&/&amp;/g;
+ $text =~ s/>/&gt;/g;
+ $text =~ s/</&lt;/g;
+ return $text;
+}
+
+# Added by Alan Jackson, alan at ajackson dot org, and based
+# on the demoronize script by John Walker, http://www.fourmilab.ch/
+# Convert Microsoft character entities into characters.
+sub demoronize_char($)
+{
+ my $s = shift;
+ # Map strategically incompatible non-ISO characters in the
+ # range 0x82 -- 0x9F into plausible substitutes where
+ # possible.
+
+ $s =~ s/\x82/,/g;
+ $s =~ s/\x84/,,/g;
+ $s =~ s/\x85/.../g;
+
+ $s =~ s/\x88/^/g;
+
+ $s =~ s/\x8B/</g;
+ $s =~ s/\x8C/Oe/g;
+
+ $s =~ s/\x91/`/g;
+ $s =~ s/\x92/'/g;
+ $s =~ s/\x93/"/g;
+ $s =~ s/\x94/"/g;
+ $s =~ s/\x95/*/g;
+ $s =~ s/\x96/-/g;
+ $s =~ s/\x97/--/g;
+
+ $s =~ s/\x9B/>/g;
+ $s =~ s/\x9C/oe/g;
+
+ return $s;
+}
+
+# convert Microsoft character entities into HTML code
+sub demoronize_code($)
+{
+ my $s = shift;
+ # Map strategically incompatible non-ISO characters in the
+ # range 0x82 -- 0x9F into plausible substitutes where
+ # possible.
+
+ $s =~ s-\x83-<em>f</em>-g;
+
+ $s =~ s-\x98-<sup>~</sup>-g;
+ $s =~ s-\x99-<sup>TM</sup>-g;
+
+ return $s;
+}
+
+# output the tag wanted (add the <> and the / if necessary)
+# - output in lower or upper case
+# - do tag-related processing
+# options:
+# tag_type=>'start' | tag_type=>'end' | tag_type=>'empty'
+# (default start)
+# inside_tag=>string (default empty)
+sub get_tag ($$;%)
+{
+ my $self = shift;
+ my $in_tag = shift;
+ my %args = (
+ tag_type => 'start',
+ inside_tag => '',
+ @_
+ );
+ my $inside_tag = $args{inside_tag};
+
+ my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
+ if (!defined $open_tag)
+ {
+ $open_tag = '';
+ }
+ # close any open tags that need closing
+ # Note that we only have to check for the structural tags we make,
+ # not every possible HTML tag
+ my $tag_prefix = '';
+ if ($self->{xhtml})
+ {
+ if ( $open_tag eq 'P'
+ and $in_tag eq 'P'
+ and $args{tag_type} ne 'end')
+ {
+ $tag_prefix = $self->close_tag('P');
+ }
+ elsif ( $open_tag eq 'P'
+ and $in_tag =~ /^(HR|UL|OL|DL|PRE|TABLE|H)/)
+ {
+ $tag_prefix = $self->close_tag('P');
+ }
+ elsif ( $open_tag eq 'LI'
+ and $in_tag eq 'LI'
+ and $args{tag_type} ne 'end')
+ {
+ # close a LI before the next LI
+ $tag_prefix = $self->close_tag('LI');
+ }
+ elsif ( $open_tag eq 'LI'
+ and $in_tag =~ /^(UL|OL)$/
+ and $args{tag_type} eq 'end')
+ {
+ # close the LI before the list closes
+ $tag_prefix = $self->close_tag('LI');
+ }
+ elsif ( $open_tag eq 'DT'
+ and $in_tag eq 'DD'
+ and $args{tag_type} ne 'end')
+ {
+ # close a DT before the next DD
+ $tag_prefix = $self->close_tag('DT');
+ }
+ elsif ( $open_tag eq 'DD'
+ and $in_tag eq 'DT'
+ and $args{tag_type} ne 'end')
+ {
+ # close a DD before the next DT
+ $tag_prefix = $self->close_tag('DD');
+ }
+ elsif ( $open_tag eq 'DD'
+ and $in_tag eq 'DL'
+ and $args{tag_type} eq 'end')
+ {
+ # close the DD before the list closes
+ $tag_prefix = $self->close_tag('DD');
+ }
+ }
+
+ my $out_tag = $in_tag;
+ if ($args{tag_type} eq 'end')
+ {
+ $out_tag = $self->close_tag($in_tag);
+ }
+ else
+ {
+ if ($self->{lower_case_tags})
+ {
+ $out_tag =~ tr/A-Z/a-z/;
+ }
+ else # upper case
+ {
+ $out_tag =~ tr/a-z/A-Z/;
+ }
+ if ($args{tag_type} eq 'empty')
+ {
+ if ($self->{xhtml})
+ {
+ $out_tag = "<${out_tag}${inside_tag}/>";
+ }
+ else
+ {
+ $out_tag = "<${out_tag}${inside_tag}>";
+ }
+ }
+ else
+ {
+ push @{$self->{__tags}}, $in_tag;
+ $out_tag = "<${out_tag}${inside_tag}>";
+ }
+ }
+ $out_tag = $tag_prefix . $out_tag if $tag_prefix;
+ if ($self->{dict_debug} & 8)
+ {
+ print STDERR
+ "open_tag = '${open_tag}', in_tag = '${in_tag}', tag_type = ",
+ $args{tag_type},
+ ", inside_tag = '${inside_tag}', out_tag = '$out_tag'\n";
+ }
+
+ return $out_tag;
+} # get_tag
+
+# close the open tag
+sub close_tag ($$)
+{
+ my $self = shift;
+ my $in_tag = shift;
+
+ my $open_tag = pop @{$self->{__tags}};
+ $in_tag ||= $open_tag;
+ # put the open tag back on the stack if the in-tag is not the same
+ if (defined $open_tag && $open_tag ne $in_tag)
+ {
+ push @{$self->{__tags}}, $open_tag;
+ }
+ my $out_tag = $in_tag;
+ if ($self->{lower_case_tags})
+ {
+ $out_tag =~ tr/A-Z/a-z/;
+ }
+ else # upper case
+ {
+ $out_tag =~ tr/a-z/A-Z/;
+ }
+ $out_tag = "<\/${out_tag}>";
+ if ($self->{dict_debug} & 8)
+ {
+ print STDERR
+"close_tag: open_tag = '${open_tag}', in_tag = '${in_tag}', out_tag = '$out_tag'\n";
+ }
+
+ return $out_tag;
+}
+
+sub hrule ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_lines_ref => undef,
+ para_action_ref => undef,
+ ind => 0,
+ @_
+ );
+ my $para_lines_ref = $args{para_lines_ref};
+ my $para_action_ref = $args{para_action_ref};
+ my $ind = $args{ind};
+
+ my $hrmin = $self->{hrule_min};
+ if ($para_lines_ref->[$ind] =~ /^\s*([-_~=\*]\s*){$hrmin,}$/)
+ {
+ my $tag = $self->get_tag("HR", tag_type => 'empty');
+ $para_lines_ref->[$ind] = "$tag\n";
+ $para_action_ref->[$ind] |= $HRULE;
+ }
+ elsif ($para_lines_ref->[$ind] =~ /\014/)
+ {
+ # Linefeeds become horizontal rules
+ $para_action_ref->[$ind] |= $HRULE;
+ my $tag = $self->get_tag("HR", tag_type => 'empty');
+ $para_lines_ref->[$ind] =~ s/\014/\n${tag}\n/g;
+ }
+}
+
+sub shortline ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ prev_ref => undef,
+ prev_action_ref => undef,
+ prev_line_len => 0,
+ @_
+ );
+ my $mode_ref = $args{mode_ref};
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+ my $prev_ref = $args{prev_ref};
+ my $prev_action_ref = $args{prev_action_ref};
+ my $prev_line_len = $args{prev_line_len};
+
+ # Short lines should be broken even on list item lines iff the
+ # following line is more text. I haven't figured out how to do
+ # that yet. For now, I'll just not break on short lines in lists.
+ # (sorry)
+
+ my $tag = $self->get_tag('BR', tag_type => 'empty');
+ if (
+ ${$line_ref} !~ /^\s*$/
+ && ${$prev_ref} !~ /^\s*$/
+ && ($prev_line_len < $self->{short_line_length})
+ && !(
+ ${$line_action_ref} &
+ ($END | $HEADER | $HRULE | $LIST | $IND_BREAK | $PAR)
+ )
+ && !(${$prev_action_ref} & ($HEADER | $HRULE | $BREAK | $IND_BREAK))
+ )
+ {
+ ${$prev_ref} .= $tag . chop(${$prev_ref});
+ ${$prev_action_ref} |= $BREAK;
+ }
+}
+
+sub is_mailheader ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+
+ # a mail header is assumed to be the whole
+ # paragraph which starts with a From , From: or Newsgroups: line
+
+ if ($rows_ref->[0] =~ /^(From:?)|(Newsgroups:) /)
+ {
+ return 1;
+ }
+ return 0;
+
+} # is_mailheader
+
+sub mailheader ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+
+ # a mail header is assumed to be the whole
+ # paragraph which starts with a From: or Newsgroups: line
+ my $tag = '';
+ my @rows = @{$rows_ref};
+
+ if ($self->is_mailheader(%args))
+ {
+ $self->{__mode} |= $MAILHEADER;
+ if ($self->{escape_HTML_chars})
+ {
+ $rows[0] = escape($rows[0]);
+ }
+ $self->anchor_mail(\$rows[0]);
+ chomp ${rows}[0];
+ $tag = $self->get_tag('P', inside_tag => " class='mail_header'");
+ my $tag2 = $self->get_tag('BR', tag_type => 'empty');
+ $rows[0] =
+ join('', "<!-- New Message -->\n", $tag, $rows[0], $tag2, "\n");
+ # now put breaks on the rest of the paragraph
+ # apart from the last line
+ for (my $rn = 1; $rn < @rows; $rn++)
+ {
+ if ($self->{escape_HTML_chars})
+ {
+ $rows[$rn] = escape($rows[$rn]);
+ }
+ if ($rn != (@rows - 1))
+ {
+ $tag = $self->get_tag('BR', tag_type => 'empty');
+ chomp $rows[$rn];
+ $rows[$rn] =~ s/$/${tag}\n/;
+ }
+ }
+ }
+ @{$rows_ref} = @rows;
+
+} # mailheader
+
+sub mailquote ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ prev_ref => undef,
+ prev_action_ref => undef,
+ next_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+ my $prev_ref = $args{prev_ref};
+ my $prev_action_ref = $args{prev_action_ref};
+ my $next_ref = $args{next_ref};
+
+ my $tag = '';
+ if (
+ (
+ (${$line_ref} =~ /^\w*&gt/) # Handle "FF> Werewolves."
+ || (${$line_ref} =~ /^[\|:]/)
+ ) # Handle "[|:] There wolves."
+ && defined($next_ref) && (${$next_ref} !~ /^\s*$/)
+ )
+ {
+ $tag = $self->get_tag('BR', tag_type => 'empty');
+ ${$line_ref} =~ s/$/${tag}/;
+ ${$line_action_ref} |= ($BREAK | $MAILQUOTE);
+ if (!(${$prev_action_ref} & ($BREAK | $MAILQUOTE)))
+ {
+ $tag = $self->get_tag('P', inside_tag => " class='quote_mail'");
+ ${$prev_ref} .= $tag;
+ ${$line_action_ref} |= $PAR;
+ }
+ }
+}
+
+# Subtracts modes listed in $mask from $vector.
+sub subtract_modes ($$)
+{
+ my ($vector, $mask) = @_;
+ return ($vector | $mask) - $mask;
+}
+
+sub paragraph ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ prev_ref => undef,
+ prev_action_ref => undef,
+ line_indent => 0,
+ prev_indent => 0,
+ is_fragment => 0,
+ ind => 0,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+ my $prev_ref = $args{prev_ref};
+ my $prev_action_ref = $args{prev_action_ref};
+ my $line_indent = $args{line_indent};
+ my $prev_indent = $args{prev_indent};
+ my $is_fragment = $args{is_fragment};
+ my $line_no = $args{ind};
+
+ my $tag = '';
+ if (
+ ${$line_ref} !~ /^\s*$/
+ && !subtract_modes(
+ ${$line_action_ref}, $END | $MAILQUOTE | $CAPS | $BREAK
+ )
+ && ( ${$prev_ref} =~ /^\s*$/
+ || (${$line_action_ref} & $END)
+ || ($line_indent > $prev_indent + $self->{par_indent}))
+ && !($is_fragment && $line_no == 0)
+ )
+ {
+
+ if ( $self->{indent_par_break}
+ && ${$prev_ref} !~ /^\s*$/
+ && !(${$line_action_ref} & $END)
+ && ($line_indent > $prev_indent + $self->{par_indent}))
+ {
+ $tag = $self->get_tag('BR', tag_type => 'empty');
+ ${$prev_ref} .= $tag;
+ ${$prev_ref} .= "&nbsp;" x $line_indent;
+ ${$line_ref} =~ s/^ {$line_indent}//;
+ ${$prev_action_ref} |= $BREAK;
+ ${$line_action_ref} |= $IND_BREAK;
+ }
+ elsif ($self->{preserve_indent})
+ {
+ $tag = $self->get_tag('P');
+ ${$prev_ref} .= $tag;
+ ${$prev_ref} .= "&nbsp;" x $line_indent;
+ ${$line_ref} =~ s/^ {$line_indent}//;
+ ${$line_action_ref} |= $PAR;
+ }
+ else
+ {
+ $tag = $self->get_tag('P');
+ ${$prev_ref} .= $tag;
+ ${$line_action_ref} |= $PAR;
+ }
+ }
+ # detect also a continuing indentation at the same level
+ elsif ($self->{indent_par_break}
+ && !($self->{__mode} & ($PRE | $TABLE | $LIST))
+ && ${$prev_ref} !~ /^\s*$/
+ && !(${$line_action_ref} & $END)
+ && (${$prev_action_ref} & ($IND_BREAK | $PAR))
+ && !subtract_modes(${$line_action_ref}, $END | $MAILQUOTE | $CAPS)
+ && ($line_indent > $self->{par_indent})
+ && ($line_indent == $prev_indent))
+ {
+ $tag = $self->get_tag('BR', tag_type => 'empty');
+ ${$prev_ref} .= $tag;
+ ${$prev_ref} .= "&nbsp;" x $line_indent;
+ ${$line_ref} =~ s/^ {$line_indent}//;
+ ${$prev_action_ref} |= $BREAK;
+ ${$line_action_ref} |= $IND_BREAK;
+ }
+}
+
+sub listprefix ($$)
+{
+ my $self = shift;
+ my $line = shift;
+
+ my ($prefix, $number, $rawprefix, $term);
+
+ my $bullets = $self->{bullets};
+ my $bullets_ordered = $self->{bullets_ordered};
+ my $number_match = '(\d+|[^\W\d])';
+ if ($bullets_ordered)
+ {
+ $number_match = '(\d+|[a-zA-Z]|[' . "${bullets_ordered}])";
+ }
+ $self->{__number_match} = $number_match;
+ my $term_match = '(\w\w+)';
+ $self->{__term_match} = $term_match;
+ return (0, 0, 0, 0)
+ if ( !($line =~ /^\s*[${bullets}]\s+\S/)
+ && !($line =~ /^\s*${number_match}[\.\)\]:]\s+\S/)
+ && !($line =~ /^\s*${term_match}:$/));
+
+ ($term) = $line =~ /^\s*${term_match}:$/;
+ ($number) = $line =~ /^\s*${number_match}\S\s+\S/;
+ $number = 0 unless defined($number);
+ if ( $bullets_ordered
+ && $number =~ /[${bullets_ordered}]/)
+ {
+ $number = 1;
+ }
+
+ # That slippery exception of "o" as a bullet
+ # (This ought to be determined using the context of what lists
+ # we have in progress, but this will probably work well enough.)
+ if ($bullets =~ /o/ && $line =~ /^\s*o\s/)
+ {
+ $number = 0;
+ }
+
+ if ($term)
+ {
+ ($rawprefix) = $line =~ /^(\s*${term_match}.)$/;
+ $prefix = $rawprefix;
+ $prefix =~ s/${term_match}//; # Take the term out
+ }
+ elsif ($number)
+ {
+ ($rawprefix) = $line =~ /^(\s*${number_match}.)/;
+ $prefix = $rawprefix;
+ $prefix =~ s/${number_match}//; # Take the number out
+ }
+ else
+ {
+ ($rawprefix) = $line =~ /^(\s*[${bullets}].)/;
+ $prefix = $rawprefix;
+ }
+ ($prefix, $number, $rawprefix, $term);
+} # listprefix
+
+sub startlist ($%)
+{
+ my $self = shift;
+ my %args = (
+ prefix => '',
+ number => 0,
+ rawprefix => '',
+ term => '',
+ para_lines_ref => undef,
+ para_action_ref => undef,
+ ind => 0,
+ prev_ref => undef,
+ total_prefix => '',
+ @_
+ );
+ my $prefix = $args{prefix};
+ my $number = $args{number};
+ my $rawprefix = $args{rawprefix};
+ my $term = $args{term};
+ my $para_lines_ref = $args{para_lines_ref};
+ my $para_action_ref = $args{para_action_ref};
+ my $ind = $args{ind};
+ my $prev_ref = $args{prev_ref};
+
+ my $tag = '';
+ $self->{__listprefix}->[$self->{__listnum}] = $prefix;
+ if ($number)
+ {
+
+ # It doesn't start with 1,a,A. Let's not screw with it.
+ if (($number ne "1") && ($number ne "a") && ($number ne "A"))
+ {
+ return 0;
+ }
+ $tag = $self->get_tag('OL');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ $self->{__list}->[$self->{__listnum}] = $OL;
+ }
+ elsif ($term)
+ {
+ $tag = $self->get_tag('DL');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ $self->{__list}->[$self->{__listnum}] = $DL;
+ }
+ else
+ {
+ $tag = $self->get_tag('UL');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ $self->{__list}->[$self->{__listnum}] = $UL;
+ }
+
+ $self->{__list_indent}->[$self->{__listnum}] = length($args{total_prefix});
+ $self->{__listnum}++;
+ $self->{__list_nice_indent} =
+ " " x $self->{__listnum} x $self->{indent_width};
+ $para_action_ref->[$ind] |= $LIST;
+ $para_action_ref->[$ind] |= $LIST_START;
+ $self->{__mode} |= $LIST;
+ 1;
+} # startlist
+
+# End N lists
+sub endlist ($%)
+{
+ my $self = shift;
+ my %args = (
+ num_lists => 0,
+ prev_ref => undef,
+ line_action_ref => undef,
+ @_
+ );
+ my $n = $args{num_lists};
+ my $prev_ref = $args{prev_ref};
+ my $line_action_ref = $args{line_action_ref};
+
+ my $tag = '';
+ for (; $n > 0; $n--, $self->{__listnum}--)
+ {
+ $self->{__list_nice_indent} =
+ " " x ($self->{__listnum} - 1) x $self->{indent_width};
+ if ($self->{__list}->[$self->{__listnum} - 1] == $UL)
+ {
+ $tag = $self->get_tag('UL', tag_type => 'end');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ pop @{$self->{__list_indent}};
+ }
+ elsif ($self->{__list}->[$self->{__listnum} - 1] == $OL)
+ {
+ $tag = $self->get_tag('OL', tag_type => 'end');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ pop @{$self->{__list_indent}};
+ }
+ elsif ($self->{__list}->[$self->{__listnum} - 1] == $DL)
+ {
+ $tag = $self->get_tag('DL', tag_type => 'end');
+ ${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
+ pop @{$self->{__list_indent}};
+ }
+ else
+ {
+ print STDERR "Encountered list of unknown type\n";
+ }
+ }
+ ${$line_action_ref} |= $END;
+ $self->{__mode} ^= $LIST if (!$self->{__listnum});
+} # endlist
+
+sub continuelist ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_lines_ref => undef,
+ para_action_ref => undef,
+ ind => 0,
+ term => '',
+ @_
+ );
+ my $para_lines_ref = $args{para_lines_ref};
+ my $para_action_ref = $args{para_action_ref};
+ my $ind = $args{ind};
+ my $term = $args{term};
+
+ my $list_indent = $self->{__list_nice_indent};
+ my $bullets = $self->{bullets};
+ my $num_match = $self->{__number_match};
+ my $term_match = $self->{__term_match};
+ my $tag = '';
+ if ( $self->{__list}->[$self->{__listnum} - 1] == $UL
+ && $para_lines_ref->[$ind] =~ /^\s*[${bullets}]\s*/)
+ {
+ $tag = $self->get_tag('LI');
+ $para_lines_ref->[$ind] =~ s/^\s*[${bullets}]\s*/${list_indent}${tag}/;
+ $para_action_ref->[$ind] |= $LIST_ITEM;
+ }
+ if ($self->{__list}->[$self->{__listnum} - 1] == $OL)
+ {
+ $tag = $self->get_tag('LI');
+ $para_lines_ref->[$ind] =~ s/^\s*${num_match}.\s*/${list_indent}${tag}/;
+ $para_action_ref->[$ind] |= $LIST_ITEM;
+ }
+ if ( $self->{__list}->[$self->{__listnum} - 1] == $DL
+ && $term)
+ {
+ $tag = $self->get_tag('DT');
+ my $tag2 = $self->get_tag('DT', tag_type => 'end');
+ $term =~ s/_/ /g; # underscores are now spaces in the term
+ $para_lines_ref->[$ind] =~
+ s/^\s*${term_match}.$/${list_indent}${tag}${term}${tag2}/;
+ $tag = $self->get_tag('DD');
+ $para_lines_ref->[$ind] .= ${tag};
+ $para_action_ref->[$ind] |= $LIST_ITEM;
+ }
+ $para_action_ref->[$ind] |= $LIST;
+} # continuelist
+
+sub liststuff ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_lines_ref => undef,
+ para_action_ref => undef,
+ para_line_indent_ref => undef,
+ ind => 0,
+ prev_ref => undef,
+ @_
+ );
+ my $para_lines_ref = $args{para_lines_ref};
+ my $para_action_ref = $args{para_action_ref};
+ my $para_line_indent_ref = $args{para_line_indent_ref};
+ my $ind = $args{ind};
+ my $prev_ref = $args{prev_ref};
+
+ my $i;
+
+ my ($prefix, $number, $rawprefix, $term) =
+ $self->listprefix($para_lines_ref->[$ind]);
+
+ if (!$prefix)
+ {
+ # if the previous line is not blank
+ if ($ind > 0 && $para_lines_ref->[$ind - 1] !~ /^\s*$/)
+ {
+ # inside a list item
+ return;
+ }
+ # This might be a new paragraph within an existing list item;
+ # It will be the first line, and have the same indentation
+ # as the list's indentation.
+ if ( $ind == 0
+ && $self->{__listnum}
+ && $para_line_indent_ref->[$ind] ==
+ $self->{__list_indent}->[$self->{__listnum} - 1])
+ {
+ # start a paragraph
+ my $tag = $self->get_tag('P');
+ ${$prev_ref} .= $tag;
+ $para_action_ref->[$ind] |= $PAR;
+ return;
+ }
+ # This ain't no list. We'll want to end all of them.
+ if ($self->{__listnum})
+ {
+ $self->endlist(
+ num_lists => $self->{__listnum},
+ prev_ref => $prev_ref,
+ line_action_ref => \$para_action_ref->[$ind]
+ );
+ }
+ return;
+ }
+
+ # If numbers with more than one digit grow to the left instead of
+ # to the right, the prefix will shrink and we'll fail to match the
+ # right list. We need to account for this.
+ my $prefix_alternate;
+ if (length("" . $number) > 1)
+ {
+ $prefix_alternate = (" " x (length("" . $number) - 1)) . $prefix;
+ }
+
+ # Maybe we're going back up to a previous list
+ for (
+ $i = $self->{__listnum} - 1;
+ ($i >= 0) && ($prefix ne $self->{__listprefix}->[$i]);
+ $i--
+ )
+ {
+ if (length("" . $number) > 1)
+ {
+ last if $prefix_alternate eq $self->{__listprefix}->[$i];
+ }
+ }
+
+ my $islist;
+
+ # Measure the indent from where the text starts, not where the
+ # prefix starts. This won't screw anything up, and if we don't do
+ # it, the next line might appear to be indented relative to this
+ # line, and get tagged as a new paragraph.
+ my $bullets = $self->{bullets};
+ my $bullets_ordered = $self->{bullets_ordered};
+ my $term_match = $self->{__term_match};
+ my ($total_prefix) =
+ $para_lines_ref->[$ind] =~ /^(\s*[${bullets}${bullets_ordered}\w]+.\s*)/;
+ # a DL indent starts from the edge of the term, plus indent_width
+ if ($term)
+ {
+ ($total_prefix) = $para_lines_ref->[$ind] =~ /^(\s*)${term_match}.$/;
+ $total_prefix .= " " x $self->{indent_width};
+ }
+
+ # Of course, we only use it if it really turns out to be a list.
+
+ $islist = 1;
+ $i++;
+ if (($i > 0) && ($i != $self->{__listnum}))
+ {
+ $self->endlist(
+ num_lists => $self->{__listnum} - $i,
+ prev_ref => $prev_ref,
+ line_action_ref => \$para_action_ref->[$ind]
+ );
+ $islist = 0;
+ }
+ elsif (!$self->{__listnum} || ($i != $self->{__listnum}))
+ {
+ if (
+ ($para_line_indent_ref->[$ind] > 0)
+ || $ind == 0
+ || ($ind > 0 && ($para_lines_ref->[$ind - 1] =~ /^\s*$/))
+ || ( $ind > 0
+ && $para_action_ref->[$ind - 1] & ($BREAK | $HEADER | $CAPS))
+ )
+ {
+ $islist = $self->startlist(
+ prefix => $prefix,
+ number => $number,
+ rawprefix => $rawprefix,
+ term => $term,
+ para_lines_ref => $para_lines_ref,
+ para_action_ref => $para_action_ref,
+ ind => $ind,
+ prev_ref => $prev_ref,
+ total_prefix => $total_prefix
+ );
+ }
+ else
+ {
+
+ # We have something like this: "- foo" which usually
+ # turns out not to be a list.
+ return;
+ }
+ }
+
+ $self->continuelist(
+ para_lines_ref => $para_lines_ref,
+ para_action_ref => $para_action_ref,
+ ind => $ind,
+ term => $term
+ )
+ if ($self->{__mode} & $LIST);
+ $para_line_indent_ref->[$ind] = length($total_prefix) if $islist;
+} # liststuff
+
+# figure out the table type of this table, if any
+sub get_table_type ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $table_type = 0;
+ if ( $self->{table_type}->{DELIM}
+ && $self->is_delim_table(%args))
+ {
+ $table_type = $TAB_DELIM;
+ }
+ elsif ($self->{table_type}->{ALIGN}
+ && $self->is_aligned_table(%args))
+ {
+ $table_type = $TAB_ALIGN;
+ }
+ elsif ($self->{table_type}->{PGSQL}
+ && $self->is_pgsql_table(%args))
+ {
+ $table_type = $TAB_PGSQL;
+ }
+ elsif ($self->{table_type}->{BORDER}
+ && $self->is_border_table(%args))
+ {
+ $table_type = $TAB_BORDER;
+ }
+
+ return $table_type;
+}
+
+# check if the given paragraph-array is an aligned table
+sub is_aligned_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # TABLES: spot and mark up tables. We combine the lines of the
+ # paragraph using the string bitwise or (|) operator, the result
+ # being in $spaces. A character in $spaces is a space only if
+ # there was a space at that position in every line of the
+ # paragraph. $space can be used to search for contiguous spaces
+ # that occur on all lines of the paragraph. If this results in at
+ # least two columns, the paragraph is identified as a table.
+
+ # Note that this sub must be called before checking for preformatted
+ # lines because a table may well have whitespace to the left, in
+ # which case it must not be incorrectly recognised as a preformat.
+ my @rows = @{$rows_ref};
+ my @starts;
+ my $spaces = '';
+ my $max = 0;
+ my $min = $para_len;
+ foreach my $row (@rows)
+ {
+ ($spaces |= $row) =~ tr/ /\xff/c;
+ $min = length $row if length $row < $min;
+ $max = length $row if $max < length $row;
+ }
+ $spaces = substr $spaces, 0, $min;
+ push(@starts, 0) unless $spaces =~ /^ /;
+ while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
+ {
+ push @starts, pos($spaces);
+ }
+
+ if (2 <= @rows and 2 <= @starts)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+sub is_pgsql_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a PGSQL table can start with an optional table-caption,
+ # then it has a row of column headings separated by |
+ # then it has a row of ------+-----
+ # then it has one or more rows of column values separated by |
+ # then it has a row-count (N rows)
+ # Thus it must have at least 4 rows.
+ if (@{$rows_ref} < 4)
+ {
+ return 0;
+ }
+
+ my @rows = @{$rows_ref};
+ if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ shift @rows;
+ }
+ if (@rows < 4)
+ {
+ return 0;
+ }
+ if ($rows[0] !~ /^\s*\w+\s+\|\s+/) # Colname |
+ {
+ return 0;
+ }
+ if ($rows[1] !~ /^\s*[-]+[+][-]+/) # ----+----
+ {
+ return 0;
+ }
+ if ($rows[2] !~ /^\s*[^|]*\s+\|\s+/) # value |
+ {
+ return 0;
+ }
+ # check the last row for rowcount
+ if ($rows[$#rows] !~ /\(\d+\s+rows\)/)
+ {
+ return 0;
+ }
+
+ return 1;
+}
+
+sub is_border_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a BORDER table can start with an optional table-caption,
+ # then it has a row of +------+-----+
+ # then it has a row of column headings separated by |
+ # then it has a row of +------+-----+
+ # then it has one or more rows of column values separated by |
+ # then it has a row of +------+-----+
+ # Thus it must have at least 5 rows.
+ # And note that it could be indented with spaces
+ if (@{$rows_ref} < 5)
+ {
+ return 0;
+ }
+
+ my @rows = @{$rows_ref};
+ if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ shift @rows;
+ }
+ if (@rows < 5)
+ {
+ return 0;
+ }
+ if ($rows[0] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
+ {
+ return 0;
+ }
+ if ($rows[1] !~ /^\s*\|\s*\w+\s+\|\s+.*\|$/) # | Colname |
+ {
+ return 0;
+ }
+ if ($rows[2] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
+ {
+ return 0;
+ }
+ if ($rows[3] !~ /^\s*\|\s*[^|]*\s+\|\s+.*\|$/) # | value |
+ {
+ return 0;
+ }
+ # check the last row for +------+------+
+ if ($rows[$#rows] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
+ {
+ return 0;
+ }
+
+ return 1;
+} # is_border_table
+
+sub is_delim_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a DELIM table can start with an optional table-caption,
+ # then it has at least two rows which start and end and are
+ # punctuated by a non-alphanumeric delimiter.
+ #
+ # | val1 | val2 |
+ # | val3 | val4 |
+ #
+ # And note that it could be indented with spaces
+ if (@{$rows_ref} < 2)
+ {
+ return 0;
+ }
+
+ my @rows = @{$rows_ref};
+ if ($rows[0] !~ /[^\w\s]/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ shift @rows;
+ }
+ if (@rows < 2)
+ {
+ return 0;
+ }
+ # figure out if the row starts with a possible delimiter
+ my $delim = '';
+ if ($rows[0] =~ /^\s*([^a-zA-Z0-9])/)
+ {
+ $delim = $1;
+ # have to get rid of ^ and []
+ $delim =~ s/\^//g;
+ $delim =~ s/\[//g;
+ $delim =~ s/\]//g;
+ if (!$delim) # no delimiter after all
+ {
+ return 0;
+ }
+ }
+ else
+ {
+ return 0;
+ }
+ # There needs to be at least three delimiters in the row
+ my @all_delims = ($rows[0] =~ /[${delim}]/g);
+ my $total_num_delims = @all_delims;
+ if ($total_num_delims < 3)
+ {
+ return 0;
+ }
+ # All rows must start and end with the delimiter
+ # and have $total_num_delims number of them
+ foreach my $row (@rows)
+ {
+ if ($row !~ /^\s*[${delim}]/)
+ {
+ return 0;
+ }
+ if ($row !~ /[${delim}]\s*$/)
+ {
+ return 0;
+ }
+ @all_delims = ($row =~ /[${delim}]/g);
+ if (@all_delims != $total_num_delims)
+ {
+ return 0;
+ }
+ }
+
+ return 1;
+} # is_delim_table
+
+sub tablestuff ($%)
+{
+ my $self = shift;
+ my %args = (
+ table_type => 0,
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $table_type = $args{table_type};
+ if ($table_type eq $TAB_ALIGN)
+ {
+ return $self->make_aligned_table(%args);
+ }
+ if ($table_type eq $TAB_PGSQL)
+ {
+ return $self->make_pgsql_table(%args);
+ }
+ if ($table_type eq $TAB_BORDER)
+ {
+ return $self->make_border_table(%args);
+ }
+ if ($table_type eq $TAB_DELIM)
+ {
+ return $self->make_delim_table(%args);
+ }
+} # tablestuff
+
+sub make_aligned_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # TABLES: spot and mark up tables. We combine the lines of the
+ # paragraph using the string bitwise or (|) operator, the result
+ # being in $spaces. A character in $spaces is a space only if
+ # there was a space at that position in every line of the
+ # paragraph. $space can be used to search for contiguous spaces
+ # that occur on all lines of the paragraph. If this results in at
+ # least two columns, the paragraph is identified as a table.
+
+ # Note that this sub must be called before checking for preformatted
+ # lines because a table may well have whitespace to the left, in
+ # which case it must not be incorrectly recognised as a preformat.
+ my @rows = @{$rows_ref};
+ my @starts;
+ my @ends;
+ my $spaces;
+ my $max = 0;
+ my $min = $para_len;
+ foreach my $row (@rows)
+ {
+ ($spaces |= $row) =~ tr/ /\xff/c;
+ $min = length $row if length $row < $min;
+ $max = length $row if $max < length $row;
+ }
+ $spaces = substr $spaces, 0, $min;
+ push(@starts, 0) unless $spaces =~ /^ /;
+ while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
+ {
+ push @ends, pos($spaces) - length $1;
+ push @starts, pos($spaces);
+ }
+ shift(@ends) if $spaces =~ /^ /;
+ push(@ends, $max);
+
+ # Two or more rows and two or more columns indicate a table.
+ if (2 <= @rows and 2 <= @starts)
+ {
+ $self->{__mode} |= $TABLE;
+
+ # For each column, guess whether it should be left, centre or
+ # right aligned by examining all cells in that column for space
+ # to the left or the right. A simple majority among those cells
+ # that actually have space to one side or another decides (if no
+ # alignment gets a majority, left alignment wins by default).
+ my @align;
+ my $cell = '';
+ foreach my $col (0 .. $#starts)
+ {
+ my @count = (0, 0, 0, 0);
+ foreach my $row (@rows)
+ {
+ my $width = $ends[$col] - $starts[$col];
+ $cell = substr $row, $starts[$col], $width;
+ ++$count[($cell =~ /^ / ? 2 : 0) +
+ ($cell =~ / $/ || length($cell) < $width ? 1 : 0)];
+ }
+ $align[$col] = 0;
+ my $population = $count[1] + $count[2] + $count[3];
+ foreach (1 .. 3)
+ {
+ if ($count[$_] * 2 > $population)
+ {
+ $align[$col] = $_;
+ last;
+ }
+ }
+ }
+
+ foreach my $row (@rows)
+ {
+ $row = join '', $self->get_tag('TR'), (
+ map {
+ $cell = substr $row, $starts[$_], $ends[$_] - $starts[$_];
+ $cell =~ s/^ +//;
+ $cell =~ s/ +$//;
+
+ if ($self->{escape_HTML_chars})
+ {
+ $cell = escape($cell);
+ }
+
+ (
+ $self->get_tag(
+ 'TD',
+ inside_tag => (
+ $self->{xhtml} ? $xhtml_alignments[$align[$_]]
+ : (
+ $self->{lower_case_tags}
+ ? $lc_alignments[$align[$_]]
+ : $alignments[$align[$_]]
+ )
+ )
+ ),
+ $cell,
+ $self->close_tag('TD')
+ );
+ } 0 .. $#starts
+ ),
+ $self->close_tag('TR');
+ }
+
+ # put the <TABLE> around the rows
+ my $tag;
+ if ($self->{xhtml})
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' summary=""');
+ }
+ else
+ {
+ $tag = $self->get_tag('TABLE');
+ }
+ $rows[0] = join("\n", $tag, $rows[0]);
+ $tag = $self->close_tag('TABLE', tag_type => 'end');
+ $rows[$#rows] .= "\n${tag}";
+ @{$rows_ref} = @rows;
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+} # make_aligned_table
+
+sub make_pgsql_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a PGSQL table can start with an optional table-caption,
+ # then it has a row of column headings separated by |
+ # then it has a row of ------+-----
+ # then it has one or more rows of column values separated by |
+ # then it has a row-count (N rows)
+ # Thus it must have at least 4 rows.
+ my @rows = @{$rows_ref};
+ my $caption = '';
+ if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ $caption = shift @rows;
+ }
+ my @headings = split(/\s+\|\s+/, shift @rows);
+ # skip the ----+--- line
+ shift @rows;
+ # grab the N rows line
+ my $n_rows = pop @rows;
+
+ # now start making the table
+ my @tab_lines = ();
+ my $tag;
+ my $tag2;
+ if ($self->{xhtml})
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1" summary=""');
+ }
+ else
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1"');
+ }
+ push @tab_lines, "$tag\n";
+ if ($caption)
+ {
+ $caption =~ s/^\s+//;
+ $caption =~ s/\s+$//;
+ $tag = $self->get_tag('CAPTION');
+ $tag2 = $self->close_tag('CAPTION');
+ $caption = join('', $tag, $caption, $tag2, "\n");
+ push @tab_lines, $caption;
+ }
+ # table header
+ my $thead = '';
+ $tag = $self->get_tag('THEAD');
+ $thead .= $tag;
+ $tag = $self->get_tag('TR');
+ $thead .= $tag;
+ foreach my $col (@headings)
+ {
+ $col =~ s/^\s+//;
+ $col =~ s/\s+$//;
+ $tag = $self->get_tag('TH');
+ $tag2 = $self->close_tag('TH');
+ $thead .= join('', $tag, $col, $tag2);
+ }
+ $tag = $self->close_tag('TR');
+ $thead .= $tag;
+ $tag = $self->close_tag('THEAD');
+ $thead .= $tag;
+ push @tab_lines, "${thead}\n";
+ $tag = $self->get_tag('TBODY');
+ push @tab_lines, "$tag\n";
+
+ # each row
+ foreach my $row (@rows)
+ {
+ my $this_row = '';
+ $tag = $self->get_tag('TR');
+ $this_row .= $tag;
+ my @cols = split(/\|/, $row);
+ foreach my $cell (@cols)
+ {
+ $cell =~ s/^\s+//;
+ $cell =~ s/\s+$//;
+ if ($self->{escape_HTML_chars})
+ {
+ $cell = escape($cell);
+ }
+ if (!$cell)
+ {
+ $cell = '&nbsp;';
+ }
+ $tag = $self->get_tag('TD');
+ $tag2 = $self->close_tag('TD');
+ $this_row .= join('', $tag, $cell, $tag2);
+ }
+ $tag = $self->close_tag('TR');
+ $this_row .= $tag;
+ push @tab_lines, "${this_row}\n";
+ }
+
+ # end the table
+ $tag = $self->close_tag('TBODY');
+ push @tab_lines, "$tag\n";
+ $tag = $self->get_tag('TABLE', tag_type => 'end');
+ push @tab_lines, "$tag\n";
+
+ # and add the N rows line
+ $tag = $self->get_tag('P');
+ push @tab_lines, "${tag}${n_rows}\n";
+ if ($self->{xhtml})
+ {
+ $tag = $self->get_tag('P', tag_type => 'end');
+ $tab_lines[$#tab_lines] =~ s/\n/${tag}\n/;
+ }
+
+ # replace the rows
+ @{$rows_ref} = @tab_lines;
+} # make_pgsql_table
+
+sub make_border_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a BORDER table can start with an optional table-caption,
+ # then it has a row of +------+-----+
+ # then it has a row of column headings separated by |
+ # then it has a row of +------+-----+
+ # then it has one or more rows of column values separated by |
+ # then it has a row of +------+-----+
+ my @rows = @{$rows_ref};
+ my $caption = '';
+ if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ $caption = shift @rows;
+ }
+ # skip the +----+---+ line
+ shift @rows;
+ # get the head row and cut off the start and end |
+ my $head_row = shift @rows;
+ $head_row =~ s/^\s*\|//;
+ $head_row =~ s/\|$//;
+ my @headings = split(/\s+\|\s+/, $head_row);
+ # skip the +----+---+ line
+ shift @rows;
+ # skip the last +----+---+ line
+ pop @rows;
+
+ # now start making the table
+ my @tab_lines = ();
+ my $tag;
+ if ($self->{xhtml})
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1" summary=""');
+ }
+ else
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1"');
+ }
+ push @tab_lines, "$tag\n";
+ if ($caption)
+ {
+ $caption =~ s/^\s+//;
+ $caption =~ s/\s+$//;
+ $tag = $self->get_tag('CAPTION');
+ $caption = $tag . $caption;
+ $tag = $self->close_tag('CAPTION');
+ $caption .= $tag;
+ push @tab_lines, "$caption\n";
+ }
+ # table header
+ my $thead = '';
+ $tag = $self->get_tag('THEAD');
+ $thead .= $tag;
+ $tag = $self->get_tag('TR');
+ $thead .= $tag;
+ foreach my $col (@headings)
+ {
+ $col =~ s/^\s+//;
+ $col =~ s/\s+$//;
+ $tag = $self->get_tag('TH');
+ $thead .= $tag;
+ $thead .= $col;
+ $tag = $self->close_tag('TH');
+ $thead .= $tag;
+ }
+ $tag = $self->close_tag('TR');
+ $thead .= $tag;
+ $tag = $self->close_tag('THEAD');
+ $thead .= $tag;
+ push @tab_lines, "${thead}\n";
+ $tag = $self->get_tag('TBODY');
+ push @tab_lines, "$tag\n";
+
+ # each row
+ foreach my $row (@rows)
+ {
+ # cut off the start and end |
+ $row =~ s/^\s*\|//;
+ $row =~ s/\|$//;
+ my $this_row = '';
+ $tag = $self->get_tag('TR');
+ $this_row .= $tag;
+ my @cols = split(/\|/, $row);
+ foreach my $cell (@cols)
+ {
+ $cell =~ s/^\s+//;
+ $cell =~ s/\s+$//;
+ if ($self->{escape_HTML_chars})
+ {
+ $cell = escape($cell);
+ }
+ if (!$cell)
+ {
+ $cell = '&nbsp;';
+ }
+ $tag = $self->get_tag('TD');
+ $this_row .= $tag;
+ $this_row .= $cell;
+ $tag = $self->close_tag('TD');
+ $this_row .= $tag;
+ }
+ $tag = $self->close_tag('TR');
+ $this_row .= $tag;
+ push @tab_lines, "${this_row}\n";
+ }
+
+ # end the table
+ $tag = $self->close_tag('TBODY');
+ push @tab_lines, "$tag\n";
+ $tag = $self->get_tag('TABLE', tag_type => 'end');
+ push @tab_lines, "$tag\n";
+
+ # replace the rows
+ @{$rows_ref} = @tab_lines;
+} # make_border_table
+
+sub make_delim_table ($%)
+{
+ my $self = shift;
+ my %args = (
+ rows_ref => undef,
+ para_len => 0,
+ @_
+ );
+ my $rows_ref = $args{rows_ref};
+ my $para_len = $args{para_len};
+
+ # a DELIM table can start with an optional table-caption,
+ # then it has at least two rows which start and end and are
+ # punctuated by a non-alphanumeric delimiter.
+ # A DELIM table has no table-header.
+ my @rows = @{$rows_ref};
+ my $caption = '';
+ if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
+ {
+ $caption = shift @rows;
+ }
+ # figure out the delimiter
+ my $delim = '';
+ if ($rows[0] =~ /^\s*([^a-zA-Z0-9])/)
+ {
+ $delim = $1;
+ }
+ else
+ {
+ return 0;
+ }
+
+ # now start making the table
+ my @tab_lines = ();
+ my $tag;
+ if ($self->{xhtml})
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1" summary=""');
+ }
+ else
+ {
+ $tag = $self->get_tag('TABLE', inside_tag => ' border="1"');
+ }
+ push @tab_lines, "$tag\n";
+ if ($caption)
+ {
+ $caption =~ s/^\s+//;
+ $caption =~ s/\s+$//;
+ $tag = $self->get_tag('CAPTION');
+ $caption = $tag . $caption;
+ $tag = $self->close_tag('CAPTION');
+ $caption .= $tag;
+ push @tab_lines, "$caption\n";
+ }
+
+ # each row
+ foreach my $row (@rows)
+ {
+ # cut off the start and end delimiter
+ $row =~ s/^\s*[${delim}]//;
+ $row =~ s/[${delim}]$//;
+ my $this_row = '';
+ $tag = $self->get_tag('TR');
+ $this_row .= $tag;
+ my @cols = split(/[${delim}]/, $row);
+ foreach my $cell (@cols)
+ {
+ $cell =~ s/^\s+//;
+ $cell =~ s/\s+$//;
+ if ($self->{escape_HTML_chars})
+ {
+ $cell = escape($cell);
+ }
+ if (!$cell)
+ {
+ $cell = '&nbsp;';
+ }
+ $tag = $self->get_tag('TD');
+ $this_row .= $tag;
+ $this_row .= $cell;
+ $tag = $self->close_tag('TD');
+ $this_row .= $tag;
+ }
+ $tag = $self->close_tag('TR');
+ $this_row .= $tag;
+ push @tab_lines, "${this_row}\n";
+ }
+
+ # end the table
+ $tag = $self->get_tag('TABLE', tag_type => 'end');
+ push @tab_lines, "$tag\n";
+
+ # replace the rows
+ @{$rows_ref} = @tab_lines;
+} # make_delim_table
+
+# Returns true if the passed string is considered to be preformatted
+sub is_preformatted ($$)
+{
+ my $self = shift;
+ my $line = shift;
+
+ my $pre_white_min = $self->{preformat_whitespace_min};
+ my $result = (
+ ($line =~ /\s{$pre_white_min,}\S+/o) # whitespaces
+ || ($line =~ /\.{$pre_white_min,}\S+/o)
+ ); # dots
+ return $result;
+}
+
+# modifies the given string,
+# and returns the front preformatted part
+sub split_end_explicit_preformat ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_ref => undef,
+ @_
+ );
+ my $para_ref = $args{para_ref};
+
+ my $tag = '';
+ my $pre_str = '';
+ my $post_str = '';
+ if ($self->{__mode} & $PRE_EXPLICIT)
+ {
+ my $pe_mark = $self->{preformat_end_marker};
+ if (${para_ref} =~ /$pe_mark/io)
+ {
+ ($pre_str, $post_str) = split(/$pe_mark/, ${$para_ref}, 2);
+ if ($self->{escape_HTML_chars})
+ {
+ $pre_str = escape($pre_str);
+ }
+ $tag = $self->close_tag('PRE');
+ $pre_str .= "${tag}\n";
+ $self->{__mode} ^= (($PRE | $PRE_EXPLICIT) & $self->{__mode});
+ }
+ else # no end -- the whole thing is preformatted
+ {
+ $pre_str = ${$para_ref};
+ if ($self->{escape_HTML_chars})
+ {
+ $pre_str = escape($pre_str);
+ }
+ ${$para_ref} = '';
+ }
+ }
+ return $pre_str;
+} # split_end_explicit_preformat
+
+sub endpreformat ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_lines_ref => undef,
+ para_action_ref => undef,
+ ind => 0,
+ prev_ref => undef,
+ @_
+ );
+ my $para_lines_ref = $args{para_lines_ref};
+ my $para_action_ref = $args{para_action_ref};
+ my $ind = $args{ind};
+ my $prev_ref = $args{prev_ref};
+
+ my $tag = '';
+ if ($self->{__mode} & $PRE_EXPLICIT)
+ {
+ my $pe_mark = $self->{preformat_end_marker};
+ if ($para_lines_ref->[$ind] =~ /$pe_mark/io)
+ {
+ if ($ind == 0)
+ {
+ $tag = $self->close_tag('PRE');
+ $para_lines_ref->[$ind] = "${tag}\n";
+ }
+ else
+ {
+ $tag = $self->close_tag('PRE');
+ $para_lines_ref->[$ind - 1] .= "${tag}\n";
+ $para_lines_ref->[$ind] = "";
+ }
+ $self->{__mode} ^= (($PRE | $PRE_EXPLICIT) & $self->{__mode});
+ $para_action_ref->[$ind] |= $END;
+ }
+ return;
+ }
+
+ if (
+ !$self->is_preformatted($para_lines_ref->[$ind])
+ && (
+ $self->{endpreformat_trigger_lines} == 1
+ || ($ind + 1 < @{$para_lines_ref}
+ && !$self->is_preformatted($para_lines_ref->[$ind + 1]))
+ || $ind + 1 >= @{$para_lines_ref} # last line of para
+ )
+ )
+ {
+ if ($ind == 0)
+ {
+ $tag = $self->close_tag('PRE');
+ ${$prev_ref} = "${tag}\n";
+ }
+ else
+ {
+ $tag = $self->close_tag('PRE');
+ $para_lines_ref->[$ind - 1] .= "${tag}\n";
+ }
+ $self->{__mode} ^= ($PRE & $self->{__mode});
+ $para_action_ref->[$ind] |= $END;
+ }
+} # endpreformat
+
+sub preformat ($%)
+{
+ my $self = shift;
+ my %args = (
+ mode_ref => undef,
+ line_ref => undef,
+ line_action_ref => undef,
+ prev_ref => undef,
+ next_ref => undef,
+ prev_action_ref => undef,
+ @_
+ );
+ my $mode_ref = $args{mode_ref};
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+ my $prev_ref = $args{prev_ref};
+ my $next_ref = $args{next_ref};
+ my $prev_action_ref = $args{prev_action_ref};
+
+ my $tag = '';
+ if ($self->{use_preformat_marker})
+ {
+ my $pstart = $self->{preformat_start_marker};
+ if (${$line_ref} =~ /$pstart/io)
+ {
+ if (${$prev_ref} =~ s/<P>$//)
+ {
+ pop @{$self->{__tags}};
+ }
+ $tag =
+ $self->get_tag('PRE', inside_tag => " class='quote_explicit'");
+ ${$line_ref} = "${tag}\n";
+ ${$mode_ref} |= $PRE | $PRE_EXPLICIT;
+ ${$line_action_ref} |= $PRE;
+ return;
+ }
+ }
+
+ if (
+ !(${$line_action_ref} & $MAILQUOTE)
+ && !(${$prev_action_ref} & $MAILQUOTE)
+ && (
+ $self->{preformat_trigger_lines} == 0
+ || (
+ $self->is_preformatted(${$line_ref})
+ && (
+ $self->{preformat_trigger_lines} == 1
+ || (defined $next_ref
+ && $self->is_preformatted(${$next_ref}))
+ )
+ )
+ )
+ )
+ {
+ if (${$prev_ref} =~ s/<P>$//)
+ {
+ pop @{$self->{__tags}};
+ }
+ $tag = $self->get_tag('PRE');
+ ${$line_ref} =~ s/^/${tag}\n/;
+ ${$mode_ref} |= $PRE;
+ ${$line_action_ref} |= $PRE;
+ }
+} # preformat
+
+sub make_new_anchor ($$)
+{
+ my $self = shift;
+ my $heading_level = shift;
+
+ my ($anchor, $i);
+
+ return sprintf("%d", $self->{__non_header_anchor}++) if (!$heading_level);
+
+ $anchor = "section";
+ $self->{__heading_count}->[$heading_level - 1]++;
+
+ # Reset lower order counters
+ for ($i = @{$self->{__heading_count}}; $i > $heading_level; $i--)
+ {
+ $self->{__heading_count}->[$i - 1] = 0;
+ }
+
+ for ($i = 0; $i < $heading_level; $i++)
+ {
+ $self->{__heading_count}->[$i] = 1
+ if !$self->{__heading_count}->[$i]; # In case they skip any
+ $anchor .= sprintf("_%d", $self->{__heading_count}->[$i]);
+ }
+ chomp($anchor);
+ $anchor;
+} # make_new_anchor
+
+sub anchor_mail ($$)
+{
+ my $self = shift;
+ my $line_ref = shift;
+
+ if ($self->{make_anchors})
+ {
+ my ($anchor) = $self->make_new_anchor(0);
+ if ($self->{lower_case_tags})
+ {
+ ${$line_ref} =~ s/([^ ]*)/<a name="$anchor">$1<\/a>/;
+ }
+ else
+ {
+ ${$line_ref} =~ s/([^ ]*)/<A NAME="$anchor">$1<\/A>/;
+ }
+ }
+} # anchor_mail
+
+sub anchor_heading ($$$)
+{
+ my $self = shift;
+ my $level = shift;
+ my $line_ref = shift;
+
+ if ($self->{dict_debug} & 8)
+ {
+ print STDERR "anchor_heading: ", ${$line_ref}, "\n";
+ }
+ if ($self->{make_anchors})
+ {
+ my ($anchor) = $self->make_new_anchor($level);
+ if ($self->{lower_case_tags})
+ {
+ ${$line_ref} =~ s/(<h.>)(.*)(<\/h.>)/$1<a name="$anchor">$2<\/a>$3/;
+ }
+ else
+ {
+ ${$line_ref} =~ s/(<H.>)(.*)(<\/H.>)/$1<A NAME="$anchor">$2<\/A>$3/;
+ }
+ }
+ if ($self->{dict_debug} & 8)
+ {
+ print STDERR "anchor_heading(after): ", ${$line_ref}, "\n";
+ }
+} # anchor_heading
+
+sub heading_level ($$)
+{
+ my $self = shift;
+
+ my ($style) = @_;
+ $self->{__heading_styles}->{$style} = ++$self->{__num_heading_styles}
+ if !$self->{__heading_styles}->{$style};
+ $self->{__heading_styles}->{$style};
+} # heading_level
+
+sub is_ul_list_line ($%)
+{
+ my $self = shift;
+ my %args = (
+ line => undef,
+ @_
+ );
+ my $line = $args{line};
+
+ my ($prefix, $number, $rawprefix, $term) = $self->listprefix($line);
+ if ($prefix && !$number)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+sub is_heading ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ next_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $next_ref = $args{next_ref};
+
+ if ( ${$line_ref} !~ /^\s*$/
+ && !$self->is_ul_list_line(line => ${$line_ref})
+ && defined $next_ref
+ && ${$next_ref} =~ /^\s*[-=*.~+]+\s*$/)
+ {
+ my ($hoffset, $heading) = ${$line_ref} =~ /^(\s*)(.+)$/;
+ $hoffset = "" unless defined($hoffset);
+ $heading = "" unless defined($heading);
+ # Unescape chars so we get an accurate length
+ $heading =~ s/&[^;]+;/X/g;
+ my ($uoffset, $underline) = ${$next_ref} =~ /^(\s*)(\S+)\s*$/;
+ $uoffset = "" unless defined($uoffset);
+ $underline = "" unless defined($underline);
+ my ($lendiff, $offsetdiff);
+ $lendiff = length($heading) - length($underline);
+ $lendiff *= -1 if $lendiff < 0;
+
+ $offsetdiff = length($hoffset) - length($uoffset);
+ $offsetdiff *= -1 if $offsetdiff < 0;
+ if ( ($lendiff <= $self->{underline_length_tolerance})
+ || ($offsetdiff <= $self->{underline_offset_tolerance}))
+ {
+ return 1;
+ }
+ }
+
+ return 0;
+
+} # is_heading
+
+# make a heading
+# assumes is_heading is true
+sub heading ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ next_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $next_ref = $args{next_ref};
+
+ my ($hoffset, $heading) = ${$line_ref} =~ /^(\s*)(.+)$/;
+ $hoffset = "" unless defined($hoffset);
+ $heading = "" unless defined($heading);
+ $heading =~ s/&[^;]+;/X/g; # Unescape chars so we get an accurate length
+ my ($uoffset, $underline) = ${$next_ref} =~ /^(\s*)(\S+)\s*$/;
+ $uoffset = "" unless defined($uoffset);
+ $underline = "" unless defined($underline);
+
+ $underline = substr($underline, 0, 1);
+
+ # Call it a different style if the heading is in all caps.
+ $underline .= "C" if $self->iscaps(${$line_ref});
+ ${$next_ref} = " "; # Eat the underline
+ $self->{__heading_level} = $self->heading_level($underline);
+ if ($self->{escape_HTML_chars})
+ {
+ ${$line_ref} = escape(${$line_ref});
+ }
+ $self->tagline("H" . $self->{__heading_level}, $line_ref);
+ $self->anchor_heading($self->{__heading_level}, $line_ref);
+} # heading
+
+# check if the given line matches a custom heading
+sub is_custom_heading ($%)
+{
+ my $self = shift;
+ my %args = (
+ line => undef,
+ @_
+ );
+ my $line = $args{line};
+
+ foreach my $reg (@{$self->{custom_heading_regexp}})
+ {
+ return 1 if ($line =~ /$reg/);
+ }
+ return 0;
+} # is_custom_heading
+
+sub custom_heading ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+
+ my $level;
+ my $i = 0;
+ foreach my $reg (@{$self->{custom_heading_regexp}})
+ {
+ if (${$line_ref} =~ /$reg/)
+ {
+ if ($self->{explicit_headings})
+ {
+ $level = $i + 1;
+ }
+ else
+ {
+ $level = $self->heading_level("Cust" . $i);
+ }
+ if ($self->{escape_HTML_chars})
+ {
+ ${$line_ref} = escape(${$line_ref});
+ }
+ $self->tagline("H" . $level, $line_ref);
+ $self->anchor_heading($level, $line_ref);
+ last;
+ }
+ $i++;
+ }
+} # custom_heading
+
+sub unhyphenate_para ($$)
+{
+ my $self = shift;
+ my $para_ref = shift;
+
+ # Treating this whole paragraph as one string, look for
+ # 1 - whitespace
+ # 2 - a word (ending in a hyphen, followed by a newline)
+ # 3 - whitespace (starting on the next line)
+ # 4 - a word with its punctuation
+ # Substitute this with
+ # 1-whitespace 2-word 4-word newline 3-whitespace
+ # We preserve the 3-whitespace because we don't want to mess up
+ # our existing indentation.
+ ${$para_ref} =~
+ /(\s*)([^\W\d_]*)\-\n(\s*)([^\W\d_]+[\)\}\]\.,:;\'\"\>]*\s*)/s;
+ ${$para_ref} =~
+s/(\s*)([^\W\d_]*)\-\n(\s*)([^\W\d_]+[\)\}\]\.,:;\'\"\>]*\s*)/$1$2$4\n$3/gs;
+} # unhyphenate_para
+
+sub tagline ($$$)
+{
+ my $self = shift;
+ my $tag = shift;
+ my $line_ref = shift;
+
+ chomp ${$line_ref}; # Drop newline
+ my $tag1 = $self->get_tag($tag);
+ my $tag2 = $self->close_tag($tag);
+ ${$line_ref} =~ s/^\s*(.*)$/${tag1}$1${tag2}\n/;
+} # tagline
+
+sub iscaps
+{
+ my $self = shift;
+ local ($_) = @_;
+
+ my $min_caps_len = $self->{min_caps_length};
+
+ # This is ugly, but I don't know a better way to do it.
+ # (And, yes, I could use the literal characters instead of the
+ # numeric codes, but this keeps the script 8-bit clean, which will
+ # save someone a big headache when they transfer via ASCII ftp.
+/^[^a-z\341\343\344\352\353\354\363\370\337\373\375\342\345\347\350\355\357\364\365\376\371\377\340\346\351\360\356\361\362\366\372\374<]*[A-Z\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\330\331\332\333\334\335\336]{$min_caps_len,}[^a-z\341\343\344\352\353\354\363\370\337\373\375\342\345\347\350\355\357\364\365\376\371\377\340\346\351\360\356\361\362\366\372\374<]*$/;
+} # iscaps
+
+sub caps
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+
+ if ( $self->{caps_tag}
+ && $self->iscaps(${$line_ref}))
+ {
+ $self->tagline($self->{caps_tag}, $line_ref);
+ ${$line_action_ref} |= $CAPS;
+ }
+} # caps
+
+sub do_delim
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ delim => '*',
+ tag => 'STRONG',
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+ my $delim = $args{delim};
+ my $tag = $args{tag};
+
+ if ($delim eq '#') # special treatment of # for the #num case
+ {
+ ${$line_ref} =~
+s/#([^0-9#](?![^#]*(?:<li>|<LI>|<P>|<p>))[^#]*[^# \t\n])#/<${tag}>$1<\/${tag}>/gs;
+ ${$line_ref} =~ s/\B#([a-zA-Z])#\B/<${tag}>$1<\/${tag}>/gs;
+ }
+ elsif ($delim eq '^')
+ {
+ ${$line_ref} =~
+s/\^((?![^^]*(?:<li>|<LI>|<p>|<P>))(\w|["'<>])[^^]*)\^/<${tag}>$1<\/${tag}>/gs;
+ ${$line_ref} =~ s/\B\^([a-zA-Z])\^\B/<${tag}>$1<\/${tag}>/gs;
+ }
+ elsif (length($delim) eq 1) # one-character, general
+ {
+ ${$line_ref} =~
+s/(?<![${delim}])[${delim}](?![^${delim}]*(?:<li>|<LI>|<p>|<P>))((\w|["'<>])[^${delim}]*)[${delim}]/<${tag}>$1<\/${tag}>/gs;
+ ${$line_ref} =~
+ s/\B[${delim}]([a-zA-Z])[${delim}]\B/<${tag}>$1<\/${tag}>/gs;
+ }
+ else
+ {
+ ${$line_ref} =~
+s/(?<!${delim})${delim}((\w|["'])(\w|[-\s\.;:,!?"'])*[^\s])${delim}/<${tag}>$1<\/${tag}>/gs;
+ ${$line_ref} =~ s/${delim}]([a-zA-Z])${delim}/<${tag}>$1<\/${tag}>/gs;
+ }
+} # do_delim
+
+# Convert very simple globs to regexps
+sub glob2regexp
+{
+ my ($glob) = @_;
+
+ # Escape funky chars
+ $glob =~ s/[^\w\[\]\*\?\|\\]/\\$&/g;
+ my ($regexp, $i, $len, $escaped) = ("", 0, length($glob), 0);
+
+ for (; $i < $len; $i++)
+ {
+ my $char = substr($glob, $i, 1);
+ if ($escaped)
+ {
+ $escaped = 0;
+ $regexp .= $char;
+ next;
+ }
+ if ($char eq "\\")
+ {
+ $escaped = 1;
+ next;
+ $regexp .= $char;
+ }
+ if ($char eq "?")
+ {
+ $regexp .= ".";
+ next;
+ }
+ if ($char eq "*")
+ {
+ $regexp .= ".*";
+ next;
+ }
+ $regexp .= $char; # Normal character
+ }
+ join('', "\\b", $regexp, "\\b");
+} # glob2regexp
+
+sub add_regexp_to_links_table ($$$$)
+{
+ my $self = shift;
+ my ($key, $URL, $switches) = @_;
+
+ # No sense adding a second one if it's already in there.
+ # It would never get used.
+ if (!$self->{__links_table}->{$key})
+ {
+
+ # Keep track of the order they were added so we can
+ # look for matches in the same order
+ push(@{$self->{__links_table_order}}, ($key));
+
+ $self->{__links_table}->{$key} = $URL; # Put it in The Table
+ $self->{__links_switch_table}->{$key} = $switches;
+ my $ind = @{$self->{__links_table_order}} - 1;
+ print STDERR " (", $ind,
+ ")\tKEY: $key\n\tVALUE: $URL\n\tSWITCHES: $switches\n\n"
+ if ($self->{dict_debug} & 1);
+ }
+ else
+ {
+ if ($self->{dict_debug} & 1)
+ {
+ print STDERR " Skipping entry. Key already in table.\n";
+ print STDERR "\tKEY: $key\n\tVALUE: $URL\n\n";
+ }
+ }
+} # add_regexp_to_links_table
+
+sub add_literal_to_links_table ($$$$)
+{
+ my $self = shift;
+ my ($key, $URL, $switches) = @_;
+
+ $key =~ s/(\W)/\\$1/g; # Escape non-alphanumeric chars
+ $key = "\\b$key\\b"; # Make a regexp out of it
+ $self->add_regexp_to_links_table($key, $URL, $switches);
+} # add_literal_to_links_table
+
+sub add_glob_to_links_table ($$$$)
+{
+ my $self = shift;
+ my ($key, $URL, $switches) = @_;
+
+ $self->add_regexp_to_links_table(glob2regexp($key), $URL, $switches);
+} # add_glob_to_links_table
+
+# Parse the dictionary file.
+# (see also load_dictionary_links, for things that were stripped)
+sub parse_dict ($$$)
+{
+ my $self = shift;
+
+ my ($dictfile, $dict) = @_;
+
+ print STDERR "Parsing dictionary file $dictfile\n"
+ if ($self->{dict_debug} & 1);
+
+ if ($dict =~ /->\s*->/)
+ {
+ my $message = "Two consecutive '->'s found in $dictfile\n";
+ my $near;
+
+ # Print out any useful context so they can find it.
+ ($near) = $dict =~ /([\S ]*\s*->\s*->\s*\S*)/;
+ $message .= "\n$near\n" if $near =~ /\S/;
+ die $message;
+ }
+
+ my ($key, $URL, $switches, $options);
+ while ($dict =~ /\s*(.+)\s+\-+([iehos]+\-+)?\>\s*(.*\S+)\s*\n/ig)
+ {
+ $key = $1;
+ $options = $2;
+ $options = "" unless defined($options);
+ $URL = $3;
+ $switches = 0;
+ # Case insensitivity
+ $switches += $LINK_NOCASE if $options =~ /i/i;
+ # Evaluate as Perl code
+ $switches += $LINK_EVAL if $options =~ /e/i;
+ # provides HTML, not just URL
+ $switches += $LINK_HTML if $options =~ /h/i;
+ # Only do this link once
+ $switches += $LINK_ONCE if $options =~ /o/i;
+ # Only do this link once per section
+ $switches += $LINK_SECT_ONCE if $options =~ /s/i;
+
+ $key =~ s/\s*$//; # Chop trailing whitespace
+
+ if ($key =~ m|^/|) # Regexp
+ {
+ $key = substr($key, 1);
+ $key =~ s|/$||; # Allow them to forget the closing /
+ $self->add_regexp_to_links_table($key, $URL, $switches);
+ }
+ elsif ($key =~ /^\|/) # alternate regexp format
+ {
+ $key = substr($key, 1);
+ $key =~ s/\|$//; # Allow them to forget the closing |
+ $key =~ s|/|\\/|g; # Escape all slashes
+ $self->add_regexp_to_links_table($key, $URL, $switches);
+ }
+ elsif ($key =~ /\"/)
+ {
+ $key = substr($key, 1);
+ $key =~ s/\"$//; # Allow them to forget the closing "
+ $self->add_literal_to_links_table($key, $URL, $switches);
+ }
+ else
+ {
+ $self->add_glob_to_links_table($key, $URL, $switches);
+ }
+ }
+
+} # parse_dict
+
+sub setup_dict_checking ($)
+{
+ my $self = shift;
+
+ # now create the replace funcs and precomile the regexes
+ my ($URL, $switches, $options, $tag1, $tag2);
+ my ($href, $r_sw);
+ my @subs;
+ my $i = 0;
+ foreach my $pattern (@{$self->{__links_table_order}})
+ {
+ $switches = $self->{__links_switch_table}->{$pattern};
+
+ $href = $self->{__links_table}->{$pattern};
+
+ if (!($switches & $LINK_HTML))
+ {
+ $href =~ s#/#\\/#g;
+ $href = (
+ $self->{lower_case_tags}
+ ? join('', '<a href="', $href, '">$&<\\/a>')
+ : join('', '<A HREF="', $href, '">$&<\\/A>')
+ );
+ }
+ else
+ {
+ # change the uppercase tags to lower case
+ if ($self->{lower_case_tags})
+ {
+ $href =~ s#(</)([A-Z]*)(>)#${1}\L${2}${3}#g;
+ $href =~ s/(<)([A-Z]*)(>)/${1}\L${2}${3}/g;
+ # and the anchors
+ $href =~ s/(<)(A\s*HREF)([^>]*>)/$1\L$2$3/g;
+ }
+ $href =~ s#/#\\/#g;
+ }
+
+ $r_sw = "s"; # Options for replacing
+ $r_sw .= "i" if ($switches & $LINK_NOCASE);
+ $r_sw .= "e" if ($switches & $LINK_EVAL);
+
+ # Generate code for replacements.
+ # Create an anonymous subroutine for each replacement,
+ # and store its reference in an array.
+ # We need to do an "eval" to create these because we need to
+ # be able to treat the *contents* of the $href variable
+ # as if it were perl code, because sometimes the $href
+ # contains things which need to be evaluated, such as $& or $1,
+ # not just those cases where we have a "e" switch.
+ my $code = <<EOT;
+\$self->{__repl_code}->[$i] =
+sub {
+my \$al = shift;
+\$al =~ s/$pattern/$href/$r_sw;
+return \$al;
+};
+EOT
+ print STDERR $code if ($self->{dict_debug} & 2);
+ push @subs, $code;
+
+ # compile searching pattern
+ if ($switches & $LINK_NOCASE) # i
+ {
+ $self->{__search_patterns}->[$i] = qr/$pattern/si;
+ }
+ else
+ {
+ $self->{__search_patterns}->[$i] = qr/$pattern/s;
+ }
+ $i++;
+ }
+ # now eval the replacements code string
+ my $codes = join('', @subs);
+ eval "$codes";
+} # setup_dict_checking
+
+sub in_link_context ($$$)
+{
+ my $self = shift;
+ my ($match, $before) = @_;
+ return 1 if $match =~ m@</?A>@i; # No links allowed inside match
+
+ my ($final_open, $final_close);
+ if ($self->{lower_case_tags})
+ {
+ $final_open = rindex($before, "<a ") - $[;
+ $final_close = rindex($before, "</a>") - $[;
+ }
+ else
+ {
+ $final_open = rindex($before, "<A ") - $[;
+ $final_close = rindex($before, "</A>") - $[;
+ }
+
+ return 1 if ($final_open >= 0) # Link opened
+ && (
+ ($final_close < 0) # and not closed or
+ || ($final_open > $final_close)
+ ); # one opened after last close
+
+ # Now check to see if we're inside a tag, matching a tag name,
+ # or attribute name or value
+ $final_open = rindex($before, "<") - $[;
+ $final_close = rindex($before, ">") - $[;
+ ($final_open >= 0) # Tag opened
+ && (
+ ($final_close < 0) # and not closed or
+ || ($final_open > $final_close)
+ ); # one opened after last close
+} # in_link_context
+
+# apply links and formatting to this paragraph
+sub apply_links ($%)
+{
+ my $self = shift;
+ my %args = (
+ para_ref => undef,
+ para_action_ref => undef,
+ @_
+ );
+ my $para_ref = $args{para_ref};
+ my $para_action_ref = $args{para_action_ref};
+
+ if ($self->{make_links}
+ && @{$self->{__links_table_order}})
+ {
+ $self->check_dictionary_links(
+ line_ref => $para_ref,
+ line_action_ref => $para_action_ref
+ );
+ }
+ if ($self->{bold_delimiter})
+ {
+ my $tag = ($self->{lower_case_tags} ? 'strong' : 'STRONG');
+ $self->do_delim(
+ line_ref => $para_ref,
+ line_action_ref => $para_action_ref,
+ delim => $self->{bold_delimiter},
+ tag => $tag
+ );
+ }
+ if ($self->{italic_delimiter})
+ {
+ my $tag = ($self->{lower_case_tags} ? 'em' : 'EM');
+ $self->do_delim(
+ line_ref => $para_ref,
+ line_action_ref => $para_action_ref,
+ delim => $self->{italic_delimiter},
+ tag => $tag
+ );
+ }
+
+} # apply_links
+
+# Check (and alter if need be) the bits in this line matching
+# the patterns in the link dictionary.
+sub check_dictionary_links ($%)
+{
+ my $self = shift;
+ my %args = (
+ line_ref => undef,
+ line_action_ref => undef,
+ @_
+ );
+ my $line_ref = $args{line_ref};
+ my $line_action_ref = $args{line_action_ref};
+
+ my ($switches, $options, $repl_func);
+ my ($linkme, $line_with_links);
+
+ # for each pattern, check and alter the line
+ my $i = 0;
+ foreach my $pattern (@{$self->{__links_table_order}})
+ {
+ $switches = $self->{__links_switch_table}->{$pattern};
+
+ # check the pattern
+ if ($switches & $LINK_ONCE) # Do link only once
+ {
+ $line_with_links = '';
+ if (!$self->{__done_with_link}->[$i]
+ && ${$line_ref} =~ $self->{__search_patterns}->[$i])
+ {
+ $self->{__done_with_link}->[$i] = 1;
+ $line_with_links .= $`;
+ $linkme = $&;
+
+ ${$line_ref} = $';
+ if (!$self->in_link_context($linkme, $line_with_links))
+ {
+ print STDERR "Link rule $i matches $linkme\n"
+ if ($self->{dict_debug} & 4);
+
+ # call the special subroutine already created to do
+ # this replacement
+ $repl_func = $self->{__repl_code}->[$i];
+ $linkme = &$repl_func($linkme);
+ }
+ $line_with_links .= $linkme;
+ }
+ ${$line_ref} = $line_with_links . ${$line_ref};
+ }
+ elsif ($switches & $LINK_SECT_ONCE) # Do link only once per section
+ {
+ $line_with_links = '';
+ if (!$self->{__done_with_sect_link}->[$i]
+ && ${$line_ref} =~ $self->{__search_patterns}->[$i])
+ {
+ $self->{__done_with_sect_link}->[$i] = 1;
+ $line_with_links .= $`;
+ $linkme = $&;
+
+ ${$line_ref} = $';
+ if (!$self->in_link_context($linkme, $line_with_links))
+ {
+ print STDERR "Link rule $i matches $linkme\n"
+ if ($self->{dict_debug} & 4);
+
+ # call the special subroutine already created to do
+ # this replacement
+ $repl_func = $self->{__repl_code}->[$i];
+ $linkme = &$repl_func($linkme);
+ }
+ $line_with_links .= $linkme;
+ }
+ ${$line_ref} = $line_with_links . ${$line_ref};
+ }
+ else
+ {
+ $line_with_links = '';
+ while (${$line_ref} =~ $self->{__search_patterns}->[$i])
+ {
+ $line_with_links .= $`;
+ $linkme = $&;
+
+ ${$line_ref} = $';
+ if (!$self->in_link_context($linkme, $line_with_links))
+ {
+ print STDERR "Link rule $i matches $linkme\n"
+ if ($self->{dict_debug} & 4);
+
+ # call the special subroutine already created to do
+ # this replacement
+ $repl_func = $self->{__repl_code}->[$i];
+ $linkme = &$repl_func($linkme);
+ }
+ $line_with_links .= $linkme;
+ }
+ ${$line_ref} = $line_with_links . ${$line_ref};
+ }
+ $i++;
+ }
+ ${$line_action_ref} |= $LINK;
+} # check_dictionary_links
+
+sub load_dictionary_links ($)
+{
+ my $self = shift;
+
+ @{$self->{__links_table_order}} = ();
+ %{$self->{__links_table}} = ();
+
+ my $dict;
+ foreach $dict (@{$self->{links_dictionaries}})
+ {
+ next unless $dict;
+ open(DICT, "$dict") || die "Can't open Dictionary file $dict\n";
+
+ my @lines = ();
+ while (<DICT>)
+ {
+ # skip lines that start with '#'
+ next if /^\#/;
+ # skip lines that end with unescaped ':'
+ next if /^.*[^\\]:\s*$/;
+ push @lines, $_;
+ }
+ close(DICT);
+ my $contents = join('', @lines);
+ $self->parse_dict($dict, $contents);
+ }
+ # last of all, do the system dictionary, already read in from DATA
+ if ($self->{__global_links_data})
+ {
+ $self->parse_dict("DATA", $self->{__global_links_data});
+ }
+
+ $self->setup_dict_checking();
+} # load_dictionary_links
+
+# do_file_start
+# extra stuff needed for the beginning
+# Args:
+# $self
+# $para
+# Return:
+# processed $para string
+sub do_file_start ($$$)
+{
+ my $self = shift;
+ my $outhandle = shift;
+ my $para = shift;
+
+ if (!$self->{extract})
+ {
+ my @para_lines = split(/\n/, $para);
+ my $first_line = $para_lines[0];
+
+ if ($self->{doctype})
+ {
+ if ($self->{xhtml})
+ {
+ print $outhandle
+'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
+ "\n";
+ print $outhandle
+ '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
+ "\n";
+ }
+ else
+ {
+ print $outhandle '<!DOCTYPE HTML PUBLIC "', $self->{doctype},
+ "\">\n";
+ }
+ }
+ print $outhandle $self->get_tag('HTML'), "\n";
+ print $outhandle $self->get_tag('HEAD'), "\n";
+
+ # if --titlefirst is set and --title isn't, use the first line
+ # as the title.
+ if ($self->{titlefirst} && !$self->{title})
+ {
+ my ($tit) = $first_line =~ /^ *(.*)/; # grab first line
+ $tit =~ s/ *$//; # strip trailing whitespace
+ $tit = escape($tit) if $self->{escape_HTML_chars};
+ $self->{'title'} = $tit;
+ }
+ if (!$self->{title})
+ {
+ $self->{'title'} = "";
+ }
+ print $outhandle $self->get_tag('TITLE'), $self->{title},
+ $self->close_tag('TITLE'), "\n";
+
+ if ($self->{append_head})
+ {
+ open(APPEND, $self->{append_head})
+ || die "Failed to open ", $self->{append_head}, "\n";
+ while (<APPEND>)
+ {
+ print $outhandle $_;
+ }
+ close(APPEND);
+ }
+
+ if ($self->{lower_case_tags})
+ {
+ print $outhandle $self->get_tag(
+ 'META',
+ tag_type => 'empty',
+ inside_tag => " name=\"generator\" content=\"$PROG v$VERSION\""
+ ),
+ "\n";
+ }
+ else
+ {
+ print $outhandle $self->get_tag(
+ 'META',
+ tag_type => 'empty',
+ inside_tag => " NAME=\"generator\" CONTENT=\"$PROG v$VERSION\""
+ ),
+ "\n";
+ }
+ if ($self->{style_url})
+ {
+ my $style_url = $self->{style_url};
+ if ($self->{lower_case_tags})
+ {
+ print $outhandle $self->get_tag(
+ 'LINK',
+ tag_type => 'empty',
+ inside_tag =>
+" rel=\"stylesheet\" type=\"text/css\" href=\"$style_url\""
+ ),
+ "\n";
+ }
+ else
+ {
+ print $outhandle $self->get_tag(
+ 'LINK',
+ tag_type => 'empty',
+ inside_tag =>
+" REL=\"stylesheet\" TYPE=\"text/css\" HREF=\"$style_url\""
+ ),
+ "\n";
+ }
+ }
+ print $outhandle $self->close_tag('HEAD'), "\n";
+ if ($self->{body_deco})
+ {
+ print $outhandle $self->get_tag('BODY',
+ inside_tag => $self->{body_deco}), "\n";
+ }
+ else
+ {
+ print $outhandle $self->get_tag('BODY'), "\n";
+ }
+ }
+
+ if ($self->{prepend_file})
+ {
+ if (-r $self->{prepend_file})
+ {
+ open(PREPEND, $self->{prepend_file});
+ while (<PREPEND>)
+ {
+ print $outhandle $_;
+ }
+ close(PREPEND);
+ }
+ else
+ {
+ print STDERR "Can't find or read file ", $self->{prepend_file},
+ " to prepend.\n";
+ }
+ }
+} # do_file_start
+
+# do_init_call
+# certain things, like reading link dictionaries, need to be
+# done once
+sub do_init_call ($)
+{
+ my $self = shift;
+
+ if (!$self->{__call_init_done})
+ {
+ push(@{$self->{links_dictionaries}}, ($self->{default_link_dict}))
+ if ($self->{make_links} && (-f $self->{default_link_dict}));
+ $self->deal_with_options();
+ if ($self->{make_links})
+ {
+ $self->load_dictionary_links();
+ }
+
+ # various initializations
+ $self->{__non_header_anchor} = 0;
+ $self->{__mode} = 0;
+ $self->{__listnum} = 0;
+ $self->{__list_nice_indent} = '';
+ $self->{__list_indent} = [];
+ $self->{__tags} = [];
+
+ $self->{__call_init_done} = 1;
+ }
+} # do_init_call
+
+=head1 FILE FORMATS
+
+There are two files which are used which can affect the outcome of the
+conversion. One is the link dictionary, which contains patterns (of how
+to recognise http links and other things) and how to convert them. The
+other is, naturally, the format of the input file itself.
+
+=head2 Link Dictionary
+
+A link dictionary file contains patterns to match, and what to convert
+them to. It is called a "link" dictionary because it was intended to be
+something which defined what a href link was, but it can be used for
+more than that. However, if you wish to define your own links, it is
+strongly advised to read up on regular expressions (regexes) because
+this relies heavily on them.
+
+The file consists of comments (which are lines starting with #)
+and blank lines, and link entries.
+Each entry consists of a regular expression, a -> separator (with
+optional flags), and a link "result".
+
+In the simplest case, with no flags, the regular expression
+defines the pattern to look for, and the result says what part
+of the regular expression is the actual link, and the link which
+is generated has the href as the link, and the whole matched pattern
+as the visible part of the link. The first character of the regular
+expression is taken to be the separator for the regex, so one
+could either use the traditional / separator, or something else
+such as | (which can be helpful with URLs which are full of / characters).
+
+So, for example, an ftp URL might be defined as:
+
+ |ftp:[\w/\.:+\-]+| -> $&
+
+This takes the whole pattern as the href, and the resultant link
+has the same thing in the href as in the contents of the anchor.
+
+But sometimes the href isn't the whole pattern.
+
+ /&lt;URL:\s*(\S+?)\s*&gt;/ --> $1
+
+With the above regex, a () grouping marks the first subexpression,
+which is represented as $1 (rather than $& the whole expression).
+This entry matches a URL which was marked explicity as a URL
+with the pattern <URL:foo> (note the &lt; is shown as the
+entity, not the actual character. This is because by the
+time the links dictionary is checked, all such things have
+already been converted to their HTML entity forms, unless, of course,
+the escape_HTML_chars option was turned off)
+This would give us a link in the form
+<A HREF="foo">&lt;URL:foo&gt;</A>
+
+B<The h flag>
+
+However, if we want more control over the way the link is constructed,
+we can construct it ourself. If one gives the h flag, then the
+"result" part of the entry is taken not to contain the href part of
+the link, but the whole link.
+
+For example, the entry:
+
+ /&lt;URL:\s*(\S+?)\s*&gt;/ -h-> <A HREF="$1">$1</A>
+
+will take <URL:foo> and give us <A HREF="foo">foo</A>
+
+However, this is a very powerful mechanism, because it
+can be used to construct custom tags which aren't links at all.
+For example, to flag *italicised words* the following
+entry will surround the words with EM tags.
+
+ /\B\*([a-z][a-z -]*[a-z])\*\B/ -hi-> <EM>$1</EM>
+
+B<The i flag>
+
+This turns on ignore case in the pattern matching.
+
+B<The e flag>
+
+This turns on execute in the pattern substitution. This really
+only makes sense if h is turned on too. In that case, the "result"
+part of the entry is taken as perl code to be executed, and the
+result of that code is what replaces the pattern.
+
+B<The o flag>
+
+This marks the entry as a once-only link. This will convert the
+first instance of a matching pattern, and ignore any others
+further on.
+
+For example, the following pattern will take the first mention
+of HTML::TextToHTML and convert it to a link to the module's home page.
+
+ "HTML::TextToHTML" -io-> http://www.katspace.com/tools/text_to_html/
+
+=head2 Input File Format
+
+For the most part, this module tries to use intuitive conventions for
+determining the structure of the text input. Unordered lists are
+marked by bullets; ordered lists are marked by numbers or letters;
+in either case, an increase in indentation marks a sub-list contained
+in the outer list.
+
+Headers (apart from custom headers) are distinguished by "underlines"
+underneath them; headers in all-capitals are distinguished from
+those in mixed case. All headers, both normal and custom headers,
+are expected to start at the first line in a "paragraph".
+
+In other words, the following is a header:
+
+ I am Head Man
+ -------------
+
+But the following does not have a header:
+
+ I am not a head Man, man
+ I am Head Man
+ -------------
+
+Tables require a more rigid convention. A table must be marked as a
+separate paragraph, that is, it must be surrounded by blank lines.
+Tables come in different types. For a table to be parsed, its
+--table_type option must be on, and the --make_tables option must be true.
+
+B<ALIGN Table Type>
+
+Columns must be separated by two or more spaces (this prevents
+accidental incorrect recognition of a paragraph where interword spaces
+happen to line up). If there are two or more rows in a paragraph and
+all rows share the same set of (two or more) columns, the paragraph is
+assumed to be a table. For example
+
+ -e File exists.
+ -z File has zero size.
+ -s File has nonzero size (returns size).
+
+becomes
+
+ <TABLE>
+ <TR><TD>-e</TD><TD>File exists.</TD></TR>
+ <TR><TD>-z</TD><TD>File has zero size.</TD></TR>
+ <TR><TD>-s</TD><TD>File has nonzero size (returns size).</TD></TR>
+ </TABLE>
+
+This guesses for each column whether it is intended to be left,
+centre or right aligned.
+
+B<BORDER Table Type>
+
+This table type has nice borders around it, and will be rendered
+with a border, like so:
+
+ +---------+---------+
+ | Column1 | Column2 |
+ +---------+---------+
+ | val1 | val2 |
+ | val3 | val3 |
+ +---------+---------+
+
+The above becomes
+
+ <TABLE border="1">
+ <THEAD><TR><TH>Column1</TH><TH>Column2</TH></TR></THEAD>
+ <TBODY>
+ <TR><TD>val1</TD><TD>val2</TD></TR>
+ <TR><TD>val3</TD><TD>val3</TD></TR>
+ </TBODY>
+ </TABLE>
+
+It can also have an optional caption at the start.
+
+ My Caption
+ +---------+---------+
+ | Column1 | Column2 |
+ +---------+---------+
+ | val1 | val2 |
+ | val3 | val3 |
+ +---------+---------+
+
+B<PGSQL Table Type>
+
+This format of table is what one gets from the output of a Postgresql
+query.
+
+ Column1 | Column2
+ ---------+---------
+ val1 | val2
+ val3 | val3
+ (2 rows)
+
+This can also have an optional caption at the start.
+This table is also rendered with a border and table-headers like
+the BORDER type.
+
+B<DELIM Table Type>
+
+This table type is delimited by non-alphanumeric characters, and has to
+have at least two rows and two columns before it's recognised as a table.
+
+This one is delimited by the '| character:
+
+ | val1 | val2 |
+ | val3 | val3 |
+
+But one can use almost any suitable character such as : # $ % + and so on.
+This is clever enough to figure out what you are using as the delimiter
+if you have your data set up like a table. Note that the line has to
+both begin and end with the delimiter, as well as using it to separate
+values.
+
+This can also have an optional caption at the start.
+
+=head1 EXAMPLES
+
+ use HTML::TextToHTML;
+
+=head2 Create a new object
+
+ my $conv = new HTML::TextToHTML();
+
+ my $conv = new HTML::TextToHTML(title=>"Wonderful Things",
+ default_link_dict=>$my_link_file,
+ );
+
+=head2 Add further arguments
+
+ $conv->args(short_line_length=>60,
+ preformat_trigger_lines=>4,
+ caps_tag=>"strong",
+ );
+
+=head2 Convert a file
+
+ $conv->txt2html(infile=>[$text_file],
+ outfile=>$html_file,
+ title=>"Wonderful Things",
+ mail=>1
+ );
+
+=head2 Make a pipleline
+
+ open(IN, "ls |") or die "could not open!";
+ $conv->txt2html(inhandle=>[\*IN],
+ outfile=>'-',
+ );
+
+=head1 NOTES
+
+=over
+
+=item *
+
+One cannot use "CLEAR" as a value for the cumulative arguments.
+
+=item *
+
+If the underline used to mark a header is off by more than 1, then
+that part of the text will not be picked up as a header unless you
+change the value of --underline_length_tolerance and/or
+--underline_offset_tolerance. People tend to forget this.
+
+=back
+
+=head1 BUGS
+
+Tell me about them.
+
+=head1 SEE ALSO
+
+perl
+L<txt2html>.
+Data::Dumper
+
+=head1 AUTHOR
+
+ Kathryn Andersen (RUBYKAT)
+ perlkat AT katspace dot com
+ http//www.katspace.com/
+
+based on txt2html by Seth Golub
+
+=head1 COPYRIGHT AND LICENCE
+
+Original txt2html script copyright (c) 1994-2000 Seth Golub <seth AT aigeek.com>
+
+Copyright (c) 2002-2005 by Kathryn Andersen
+
+This program is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+#------------------------------------------------------------------------
+1;
+__DATA__
+#
+# Global links dictionary file for HTML::TextToHTML
+# http://www.katspace.com/tools/text_to_html
+# http://txt2html.sourceforge.net/
+# based on links dictionary for Seth Golub's txt2html
+# http://www.aigeek.com/txt2html/
+#
+# This dictionary contains some patterns for converting obvious URLs,
+# ftp sites, hostnames, email addresses and the like to hrefs.
+#
+# Original adapted from the html.pl package by Oscar Nierstrasz in
+# the Software Archive of the Software Composition Group
+# http://iamwww.unibe.ch/~scg/Src/
+#
+
+# Some people even like to mark the URL label explicitly <URL:foo:label>
+/&lt;URL:([-\w\.\/:~_\@]+):([a-zA-Z0-9'() ]+)&gt;/ -h-> <A HREF="$1">$2</A>
+
+# Some people like to mark URLs explicitly <URL:foo>
+/&lt;URL:\s*(\S+?)\s*&gt;/ -h-> <A HREF="$1">$1</A>
+
+# <http://site>
+/&lt;(http:\S+?)\s*&gt;/ -h-> &lt;<A HREF="$1">$1</A>&gt;
+
+# Urls: <service>:<rest-of-url>
+
+|snews:[\w\.]+| -> $&
+|news:[\w\.]+| -> $&
+|nntp:[\w/\.:+\-]+| -> $&
+|http:[\w/\.:\@+\-~\%#?=&;,]+[\w/]| -> $&
+|shttp:[\w/\.:+\-~\%#?=&;,]+| -> $&
+|https:[\w/\.:+\-~\%#?=&;,]+| -> $&
+|file:[\w/\.:+\-]+| -> $&
+|ftp:[\w/\.:+\-]+| -> $&
+|wais:[\w/\.:+\-]+| -> $&
+|gopher:[\w/\.:+\-]+| -> $&
+|telnet:[\w/\@\.:+\-]+| -> $&
+
+
+# catch some newsgroups to avoid confusion with sites:
+|([^\w\-/\.:\@>])(alt\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(bionet\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(bit\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(biz\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(clari\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(comp\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(gnu\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(humanities\.[\w\.+\-]+[\w+\-]+)|
+ -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(k12\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(misc\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(news\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(rec\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(soc\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(talk\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(us\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(ch\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+|([^\w\-/\.:\@>])(de\.[\w\.+\-]+[\w+\-]+)| -h-> $1<A HREF="news:$2">$2</A>
+
+# FTP locations (with directory):
+# anonymous@<site>:<path>
+|(anonymous\@)([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)|
+ -h-> $1<A HREF="ftp://$2/$4">$2:$4</A>$3
+
+# ftp@<site>:<path>
+|(ftp\@)([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)|
+ -h-> $1<A HREF="ftp://$2/$4">$2:$4</A>$3
+
+# Email address
+|[a-zA-Z0-9_\+\-\.]+\@([a-zA-Z0-9][\w\.+\-]+\.[a-zA-Z]{2,})|
+ -> mailto:$&
+
+# <site>:<path>
+|([^\w\-/\.:\@>])([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)|
+ -h-> $1<A HREF="ftp://$2/$4">$2:$4</A>$3
+
+# NB: don't confuse an http server with a port number for
+# an FTP location!
+# internet number version: <internet-num>:<path>
+|([^\w\-/\.:\@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/\.]+)|
+ -h-> $1<A HREF="ftp://$2/$3">$2:$3</A>
+
+# telnet <site> <port>
+|telnet ([a-zA-Z][\w+\-]+(\.[\w\.+\-]+)+\.[a-zA-Z]{2,})\s+(\d{2,4})|
+ -h-> telnet <A HREF="telnet://$1:$3/">$1 $3</A>
+
+# ftp <site>
+|ftp ([a-zA-Z][\w+\-]+(\.[\w\.+\-]+)+\.[a-zA-Z]{2,})|
+ -h-> ftp <A HREF="ftp://$1/">$1</A>
+
+# host with "ftp" in the machine name
+|\b([a-zA-Z][\w])*ftp[\w]*(\.[\w+\-]+){2,}| -h-> ftp <A HREF="ftp://$&/">$&</A>
+
+# ftp.foo.net/blah/
+|ftp(\.[a-zA-Z0-9_\@:-]+)+/\S+| -> ftp://$&
+
+# www.thehouse.org/txt2html/
+|www(\.[a-zA-Z0-9_\@:-]+)+/\S+| -> http://$&
+
+# host with "www" in the machine name
+|\b([a-zA-Z][\w])*www[\w]*(\.[\w+\-]+){2,}| -> http://$&/
+
+# <site> <port>
+|([a-zA-Z][\w+\-]+\.[\w+\-]+\.[a-zA-Z]{2,})\s+(\d{2,4})|
+ -h-> <A HREF="telnet://$1:$2/">$1 $2</A>
+
+# just internet numbers with port:
+|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+(\d{1,4})|
+ -h-> $1<A HREF="telnet://$2:$3">$2 $3</A>
+
+# just internet numbers:
+|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|
+ -h-> $1<A HREF="telnet://$2">$2</A>
+
+# RFCs
+/RFC ?(\d+)/ -i-> http://www.cis.ohio-state.edu/rfc/rfc$1.txt
+
+# This would turn "f^H_o^H_o^H_" into "<U>foo</U>". Gross, isn't it?
+# Thanks to Mark O'Dell <emark@cns.caltech.edu> for fixing this.
+#
+# /(.\\010_)+/ -he-> $tmp = $&;$tmp =~ s@\010_@@g;"<U>$tmp</U>"
+# /(_\\010.)+/ -he-> $tmp = $&;$tmp =~ s@_\010@@g;"<U>$tmp</U>"
+# /(.\^H_)+/ -he-> $tmp = $&;$tmp =~ s@\^H_@@g;"<U>$tmp</U>"
+# /(_\^H.)+/ -he-> $tmp = $&;$tmp =~ s@_\^H@@g;"<U>$tmp</U>"
+
+# Mark _underlined stuff_ as <U>underlined stuff</U>
+# Don't mistake variable names for underlines, and
+# take account of possible trailing punctuation
+/([ \t\n])_([a-z][a-z0-9 -]*[a-z])_([ \t\n\.;:,\!\?])/ -hi-> $1<U>$2</U>$3
+
+# Seth and his amazing conversion program :-)
+
+"Seth Golub" -io-> http://www.aigeek.com/
+"txt2html" -io-> http://txt2html.sourceforge.net/
+
+# Kathryn and her amazing modules 8-)
+"Kathryn Andersen" -io-> http://www.katspace.com/
+"HTML::TextToHTML" -io-> http://www.katspace.com/tools/text_to_html/
+"hypertoc" -io-> http://www.katspace.com/tools/hypertoc/
+"HTML::GenToc" -io-> http://www.katspace.com/tools/hypertoc/
+
+# End of global dictionary
+