#!/usr/bin/perl
# ########################################################################### #
# Html2Wml #
# ======== #
# Author: Sebastien Aperghis-Tramoni This is the result of the conversion of the document |,
qq|$state{doc_uri} by $program v$version. Result of XML check: Result of WML compilation: Time: $time wallclock secs (%.2f usr + %.2f sys = %.2f cpu) \s+| |go; ## collapse spaces at the begining of a paragraph
$_[0] =~ s|\s+$program -- Debug Mode
\n|,
qq|
\n|,
htmlize($result),
qq|
\n
\n|,
($complres ? "$complres\n" : hextype($binary)), "
\n"
if $options{compile};
printf "
\n
]*>\s*
||go; $_[0] =~ s|]*>\s*(?:
)+\s*
]*>\s*(?:\ \s*)+
||go; $_[0] =~ s|]*>\s*(?:\ \s*)+
||go; $_[0] =~ s|]*>\s*(?:\[IMG\]\s*)+
||go; $_[0] =~ s|<(\w+)>\s*\1>||go; ## collapse multiple lines $_[0] =~ s/\n+/\n/go; $_[0] =~ s/(?: +\n)+/\n/go; } # # get_url() # ------- # This function gets and returns the file from the given URI. # If called in a array context, returns the file content and the associated # MIME type (as given by the server). # sub get_url { my $uri = shift; my $quiet = shift || 0; if($cgi and index($uri, 'file:') == 0) { cgi_error(q|For security reasons, the file: scheme is not allowed.|) } my $request = new HTTP::Request GET => $uri; my $response = $agent->request($request); if($response->is_error) { if($response->status_line == 401) { ## Authorization required my($realm) = ($response->header('WWW-Authenticate') =~ /realm=(.+)/); my $self = "$state{self_url}url=$state{doc_uri}"; if($options{'http-user'} and $options{'http-passwd'}) { $request->www_authenticate($response->header('WWW-Authenticate')); $request->authorization_basic($options{'http-user'}, $options{'http-passwd'}); $response = $agent->request($request); } else { if($cgi) { print $cgi->header(-type => 'text/vnd.wap.wml'), <<"PASSFORM"; exit $defaults{wmlvers}Please enter your user name and password for $realm.
User:
Password:
Frame: $$attr{name}
|; } ## special case: image map tag if($tag eq 'area') { if($prev_tag eq 'p') { pop @{$state{stack}}; $state{output} .= '' } if($prev_tag eq 'wml') { push @{$state{stack}}, 'card'; $state{output} .= 'Image map: $$attr{href}
|; } ## special case: when inside a don't allow opening tags if($prev_tag eq 'a' and $with{a}{nest} !~ /\b$curr_tag\b/) { return } ## special case: is replaced by";
push @{$state{stack}}, 'p';
}
clean_spaces($text) if $options{'collapse'} and $curr_tag ne 'pre';
#
# TODO: add the code that split too long chunks of text
#
$state{output} .= $text;
$state{cardsize} += length $text;
}
#
# HTML::Parser comment tag handler
#
sub comment_tag {
my($comment) = @_;
local $_;
$comment = join '', @$comment;
#debug [3], "( comment ) stack = (@{$state{stack}})\n $comment\n";
## Actions engine
if($comment =~ /^\s*\[(\w+)\s*(.*)\]\s*$/) {
my $action = $1;
my %attributes = map { /\G(\w+)=["']([^"']+)["']/g } split /\s+/, $2;
for my $attr (keys %attributes) {
if($attr eq 'for') {
return if $attributes{$attr} ne $state{type};
}
if($attr eq 'virtual' and $attributes{virtual} !~ /^http:/) {
$attributes{virtual} = URI::URL->new( $attributes{virtual}, $state{doc_uri} )->abs
}
}
for($action) {
/include/ and do {
my $buf;
if($attributes{virtual}) {
$buf = get_url($attributes{virtual}, 1);
} elsif($attributes{file}) {
$buf = read_file($attributes{file}, 1)
}
$state{output} .= $buf;
$state{cardsize} = length $buf;
};
/skip/ and do {
$state{skip} = 1;
};
/end_skip/ and do {
$state{skip} = 0;
};
/fsize/ and do {
my $buf;
if($attributes{virtual}) {
$buf = length get_url($attributes{virtual}, 1);
} elsif($attributes{file}) {
$buf = length read_file($attributes{file}, 1)
}
$state{output} .= $buf;
$state{cardsize} = length $buf;
};
}
}
}
#
# HTML::Parser default handler
#
sub default_handler {
my($text) = @_;
#debug [2], "( default ) [$text]\n\n";
}
#
# split_card()
# ----------
# This function closes the current card and creates a new one.
#
sub split_card {
my @stack = @{$state{stack}};
shift @stack; ## shift the |,
qq| This program was called with incorrect parameters or an error occured
when processing the request. Please check your request and try again. @_ $program v$version This program was called with incorrect parameters or an error occured
when processing the request. Please check your request and try again. @_ _____$str
"
}
#
# hextype()
# -------
# This function generates a human readable representation of binary data
#
sub hextype {
my $data = shift; ## data to print
my $colwidth = shift || 16; ## width of ASCII column
my $half = $colwidth/2;
my $line = 1;
my $out = '';
while(length $data) {
my @hex = unpack 'C'x$colwidth, substr($data, 0, $colwidth);
substr($data, 0, $colwidth) = '';
$out .= sprintf '%3d: '. ((('%02x 'x$half).' ')x2) .' ', $line++, @hex;
$out .= sprintf ''.('%s'x$half)x2 . "\n", map { $_ > 32 ? chr : '.' } @hex;
}
return $out
}
#
# simple_wrap()
# -----------
# This function wraps the text given in parameter.
#
sub simple_wrap {
my $orig = ref $_[0] ? $_[0] : \$_[0];
my $text = '';
my $curlen = 0;
my $beg = ' 'x5;
my $cols = 75;
while($$orig =~ m/(\s*\S+\s+)/gm) {
if($curlen + length($1) > $cols) {
$text .= "\n$beg$1";
$curlen = 1 + length($beg) + length($1)
} else {
$text .= $1;
$curlen += length $1;
}
$curlen = 0 if index($1, "\n") >= 0;
}
return $text
}
#
# load_entities()
# -------------
#
sub load_entities {
%entities = (
## Special entities
quot => [ 34, '"'],## double quote
quote => [ 34, '"'],## double quote
amp => [ 38, '&'],## ampersand
apos => [ 39, '''],## single quote
lt => [ 60, '<'],## less than sign
gt => [ 62, '>'],## greater than sign
## Spacing characters
nbsp => [ 32, ' '], ## non-breaking space (real value #160)
ensp => [ 32, ' '], ## en space (real value: #8194, U+2002)
emsp => [ 32, ' '], ## em space (real value: #8195, U+2003)
thinsp => [ 32, ' '], ## thin space (real value: #8201, U+2009)
zwnj => [ 0, '' ], ## zero width non-joiner (real value: #8204, U+200C)
zwj => [ 0, '' ], ## zero width joiner (real value: #8205, U+200D)
## Latin Extended-A entities + Mathematical symbols
sbquo => [130, ','], ## single low-9 quotation mark
fnof => [131, 'f'], ## latin small f with hook = florin
bdquo => [132, ',,'], ## double low-9 quotation mark
hellip => [133, '...'], ## horizontal ellipsis
dagger => [134, ' '], ## dagger
Dagger => [135, ' '], ## double dagger
circ => [136, '^'], ## modifier letter circumflex accent
permil => [137, 'o/oo'], ## per mille sign
Scaron => [138, 'S'], ## latin capital letter S with caron
lsaquo => [139, '<'],## single left-pointing angle quotation mark
OElig => [140, 'OE'], ## latin capital ligature OE
lsquo => [145, "'"], ## left single quotation mark
rsquo => [146, "'"], ## right single quotation mark
ldquo => [147, '"'], ## left double quotation mark
rdquo => [148, '"'], ## right double quotation mark
bull => [149, 'o'], ## bullet
ndash => [150, '-'], ## en dash
mdash => [151, '--'], ## em dash
tilde => [152, '~'], ## small tilde
trade => [153, '(tm)'], ## trademark sign
scaron => [154, 's'], ## latin small letter s with caron
rsaquo => [155, '>'],## single right-pointing angle quotation mark
oelig => [156, 'oe'], ## latin small ligature oe
Yuml => [159, 'Y'], ## latin capital letter Y with diaeresis
## ISO-Latin-1 entities
iexcl => [161, '!'],
cent => [162, '-c-'],
pound => [163, '-L-'],
curren => [164, 'CUR'],
yen => [165, 'YEN'],
brvbar => [166, '|'],
sect => [167, 'S:'],
uml => [168, '"'],
copy => [169, '(c)'],
ordf => [170, '-a'],
laquo => [171, '<<'],
'not' => [172, 'NOT'],
shy => [173, '-'],
reg => [174, '(R)'],
macr => [175, '-'],
deg => [176, 'DEG'],
plusmn => [177, '+/-'],
sup2 => [178, '^2'],
sup3 => [179, '^3'],
acute => [180, "'"],
micro => [181, 'u'],
para => [182, 'P:'],
middot => [183, '.'],
cedil => [184, ','],
sup1 => [185, '^1'],
ordm => [186, '-o'],
raquo => [187, '>>'],
frac14 => [188, ' 1/4'],
frac12 => [189, ' 1/2'],
frac34 => [190, ' 3/4'],
iquest => [191, '?'],
Agrave => [192, 'A'],
Aacute => [193, 'A'],
Acirc => [194, 'A'],
Atilde => [195, 'A'],
Auml => [196, 'Ae'],
Aring => [197, 'A'],
AElig => [198, 'AE'],
Ccedil => [199, 'C'],
Egrave => [200, 'E'],
Eacute => [201, 'E'],
Ecirc => [202, 'E'],
Euml => [203, 'E'],
Igrave => [204, 'I'],
Iacute => [205, 'I'],
Icirc => [206, 'I'],
Iuml => [207, 'I'],
ETH => [208, 'DH'],
Ntilde => [209, 'N'],
Ograve => [210, 'O'],
Oacute => [211, 'O'],
Ocirc => [212, 'O'],
Otilde => [213, 'O'],
Ouml => [214, 'Oe'],
'times' => [215, '*'],
Oslash => [216, 'O'],
Ugrave => [217, 'U'],
Uacute => [218, 'U'],
Ucirc => [219, 'U'],
Uuml => [220, 'Ue'],
Yacute => [221, 'Y'],
THORN => [222, 'P'],
szlig => [223, 'ss'],
agrave => [224, 'a'],
aacute => [225, 'a'],
acirc => [226, 'a'],
atilde => [227, 'a'],
auml => [228, 'ae'],
aring => [229, 'a'],
aelig => [230, 'ae'],
ccedil => [231, 'c'],
egrave => [232, 'e'],
eacute => [233, 'e'],
ecirc => [234, 'e'],
euml => [235, 'e'],
igrave => [236, 'i'],
iacute => [237, 'i'],
icirc => [238, 'i'],
iuml => [239, 'i'],
eth => [240, 'e'],
ntilde => [241, 'n'],
ograve => [242, 'o'],
oacute => [243, 'o'],
ocirc => [244, 'o'],
otilde => [245, 'o'],
ouml => [246, 'o'],
divide => [247, '/'],
oslash => [248, 'o'],
ugrave => [249, 'u'],
uacute => [250, 'u'],
ucirc => [251, 'u'],
uuml => [252, 'u'],
yacute => [253, 'y'],
thorn => [254, 'p'],
yuml => [255, 'y'],
);
}
#
# warning()
# -------
sub warning {
print STDERR 'html2wml: warning: ', @_
}
#
# fatal()
# -----
sub fatal {
print STDERR 'html2wml: fatal: ', @_;
exit -1;
}
#
# debug()
# -----
sub debug {
if($options{'debug'}) {
my $level = ref $_[0] ? shift->[0] : 1;
print STDERR @_ if $level <= $options{'debug'}
}
}
#
# version()
# -------
sub version {
print "$program/$version\n"; exit
}
#
# usage()
# -----
sub usage {
print STDERR <<"USAGE"; exit
usage: $0 [options] file [-o output]
options:
-a, --ascii use 7 bits ASCII emulation to convert named entities
--nocollapse don't collapse spaces and empty paragraphs
--hreftmpl=template set the template for the links reconstruction engine
-i, --ignore-images completly ignore image links
--noimg-alt-text don't replace the images by their alternative text
--nolinearize don't linearize the tables
-n, --numeric-non-ascii convert non-ASCII characters to numeric entities
-p, --nopre don't use the tag
--split-card slice the document by cards (default)
--split-deck slice the document by decks
-s, --max-card-size=size set the card size upper limit
-t, --card-split-threshold=size set the card splitting threshold
--next-card-label=label set the label of the link to the next card
--prev-card-label=label set the label of the link to the previous card
-U, --http-user set the HTTP user
-P, --http-passwd set the HTTP password
-Y, --proxy use proxy settings provided by environnement
--noproxy don't use proxy
-k, --compile compile the result in binary form
-o, --output=outfile select the outpout (stdout if none specified)
-d, --debug=n activate the debug mode (always prints to stdout)
-c, --xmlcheck activate the XML well-formedness and validity check
-h, --help show this help screen and exit
-v, --version show the program name and version and exit
Read the documentation for more information.
USAGE
}
#
# cgi_error()
# ---------
sub cgi_error {
if($options{'debug'}) {
print <<"OUTPUT"; exit
Html2Wml - Error
$program v$version