\n",
@times[0..2];
print qq|\n\n|
} else {
my $s = "$program -- Debug Mode\n";
print $s, '-'x length($s), "\n",
$result, "\n", ' -'x5, "\n",
$xmlckres, "\n";
print ' -'x5, "\nCompiled WML\n", ' -'x5, "\n",
($complres ? "$complres\n" : hextype($binary))
if $options{compile};
print ' -'x5, "\n";
printf "Time: $time wallclock secs (%.2f usr + %.2f sys = %.2f cpu)\n", @times[0..2];
}
} else { ## normal output
my $out = \*STDOUT;
if($options{'output'}) {
open(OUT, ">$options{output}") or fatal "cannot write to '$options{output}': $!\n";
$out = \*OUT;
}
if($options{'compile'}) {
print $out $cgi->header(
-type => 'application/vnd.wap.wmlc',
-content_length => length $result
) if $cgi;
print $out $binary;
} else {
print $out $cgi->header(
-type => "text/vnd.wap.wml; charset=$state{encoding}",
-content_length => length $result
) if $cgi;
print $out $result;
}
}
#
# apply_options()
# -------------
sub apply_options {
if($options{'linearize'}) {
delete @with{qw(table tr td th)};
$with{'caption'} = { action => 'replace', new_value => 'p', render => 'b' };
$with{'tr'} = { action => 'replace', new_value => 'p' };
delete @reverse{qw(table tr td)};
}
if($options{'ignore-images'}) {
delete $with{'img'};
}
if(not defined $options{'debug'}) {
$options{'debug'} = 0;
} elsif($options{'debug'} == 0) {
$options{'debug'} = 1;
}
if($options{'debug'}) {
$options{'xmlcheck'} = 1;
}
if($options{'nopre'}) {
delete $with{'pre'};
$with{'pre'} = { action => 'replace', new_value => 'p' };
$with{'plaintext'} = { action => 'replace', new_value => 'p' };
}
if($cgi) {
$options{'split-card'} = 0;
$options{'split-deck'} = 1;
## security: don't allow to modify the templates
## when called as a CGI
$options{'hreftmpl'} = $defaults{'hreftmpl'};
$options{'srctmpl'} = $defaults{'srctmpl'};
}
## security: check if the templates contains suspicious code
## if the templates have changed
if($options{hreftmpl} ne $defaults{hreftmpl} or $options{srctmpl} ne $defaults{srctmpl}) {
my $forbidden = join '|', '[``]', map {"\\b$_\\b"}
qw(eval exec system unlink kill fork open sysopen pipe socket);
$options{hreftmpl} = $defaults{hreftmpl} if $options{hreftmpl} =~ /$forbidden/;
$options{srctmpl} = $defaults{hreftmpl} if $options{srctmpl} =~ /$forbidden/;
}
$options{'cardsize-limit'} = $options{'max-card-size'} - $options{'card-split-threshold'};
if($^O eq 'MacOS') {
$options{'compile'} = 0;
}
if($options{'compile'}) {
$options{'prev-card-label'} = '[<<]';
}
## proxy support
if($options{'proxy'}) {
if($defaults{'proxy-server'}) {
## use hardcoded settings
$agent->proxy([qw(http ftp gopher)] => $defaults{'proxy-server'});
} else {
## load from environment
$agent->env_proxy();
}
}
## cHTML: there is no "multi-body" in cHTML as there is the
## multi-card mechanism in WML, so we always activate the
## document slicing (split-deck option)
$options{'split-card'} = 0;
$options{'split-deck'} = 1;
}
#
# html2wml()
# --------
sub html2wml {
my $url = shift;
my $file = '';
my $type = '';
my $enc = '';
my $converter = new HTML::Parser api_version => 3;
return unless $url;
## read the file
if($url =~ m{https?://}) { ## absolute uri
($file,$type,$enc) = get_url($url)
} elsif(not $cgi) { ## local file
$file = read_file($url)
} else { ## absolute url relative to the server
($file,$type,$enc) = get_url( $url = URI::URL->new($url, $cgi->url)->abs )
}
$enc ||= '';
$enc =~ s/charset=//i;
url_encode($url);
$state{doc_uri} = $url;
($state{self_srv}) = ($state{self_url} =~ m|^(https?://[\w.-]+(?::\d+)?)/|);
## strip the DOCTYPE
$file =~ s/]+>//go;
## try to get the document charset encoding
if(not $enc and $file =~ m|meta +http-equiv.+charset=["']?([a-zA-Z0-9_-]+)['"]?|i) {
$enc = lc $1
}
$state{encoding} = $enc || $defaults{'encoding'};
$type ||= '';
## if it's an image, call send_image()
if(index($type, 'image') >= 0 or $url =~ /\.(?:gif|jpg|png)$/i) {
@_ = ($file, $url);
goto &send_image
}
## get the document title
if($file =~ m|([^<]+)|i) {
$state{title} = $1;
convert_entities($state{title});
clean_spaces($state{title});
}
## WML header
$state{skip} = 0;
$state{output} = "$defaults{wmlvers}\n";
## affectation of the HTML::Parser handlers
$converter->unbroken_text(1);
$converter->handler(start => \&start_tag, 'tagname, attr');
$converter->handler(end => \&end_tag, 'tagname');
$converter->handler(text => \&text_tag, 'text, is_cdata');
$converter->handler(comment => \&comment_tag, 'tokens');
#$converter->handler(declaration => \&default_handler, 'text');
#$converter->handler(process => \&default_handler, 'text');
#$converter->handler(default => \&default_handler, 'text');
## begin the conversion
$converter->parse($file);
$converter->eof;
## flush the stack
while(my $tag = pop @{$state{stack}}) {
$state{output} .= "$tag>"
}
post_conversion_cleanup();
$state{decks}{$state{cardid}} = $state{output};
return $state{output}
}
#
# post_conversion_cleanup()
# -----------------------
#
sub post_conversion_cleanup {
## convert alone ampersand characters to entities
$state{output} =~ s/\&\s/\& /go;
## correct unclosed numeric entities
$state{output} =~ s/(\\d+)([^\d;])/$1;$2/go;
## convert the named HTML entities to numeric entities
convert_entities($state{output});
## convert non-ASCII characters to numeric entities
if($options{'numeric-non-ascii'}) {
$state{output} =~ s/([\x80-\xFF])/''.ord($1).';'/eg;
}
## escape $ chars
#$state{output} =~ s/\$/\$\$/go; ## not needed in cHTML
collapse($state{output}) if $options{'collapse'};
## set the title of the card
if(length $state{title}) {
my $title = $state{title};
$title =~ s/"/\"/go;
$title =~ s/\$/\$\$/go;
$title =~ s/(\\d+)([^\d;])/$1;$2/go;
$state{output} =~ s/|>|go; ## collapse spaces inside tags
$_[0] =~ s|\s+/>|/>|go; ## collapse spaces inside empty tags
$_[0] =~ s|<(\w+) +|<$1 |g; ## collapse spaces between tag and attributes
$_[0] =~ s|
\s+|
|go; ## collapse spaces at the begining of a paragraph
$_[0] =~ s|\s+
|
|go; ## collapse spaces at the end of a paragraph
## collapse empty paragraphs
$_[0] =~ s|
]*>\s*
||go;
$_[0] =~ s|
]*>\s*(?: )+\s*
||go;
$_[0] =~ s|
]*>\s*(?:\ \s*)+
||go;
$_[0] =~ s|
]*>\s*(?:\ \s*)+
||go;
$_[0] =~ s|
]*>\s*(?:\[IMG\]\s*)+
||go;
$_[0] =~ s|<(\w+)>\s*\1>||go;
## collapse multiple lines
$_[0] =~ s/\n+/\n/go;
$_[0] =~ s/(?: +\n)+/\n/go;
}
#
# get_url()
# -------
# This function gets and returns the file from the given URI.
# If called in a array context, returns the file content and the associated
# MIME type (as given by the server).
#
sub get_url {
my $uri = shift;
my $quiet = shift || 0;
if($cgi and index($uri, 'file:') == 0) {
cgi_error(q|For security reasons, the file: scheme is not allowed.|)
}
my $request = new HTTP::Request GET => $uri;
my $response = $agent->request($request);
if($response->is_error) {
if($response->status_line == 401) {
## Authorization required
my($realm) = ($response->header('WWW-Authenticate') =~ /realm=(.+)/);
my $self = "$state{self_url}url=$state{doc_uri}";
if($options{'http-user'} and $options{'http-passwd'}) {
$request->www_authenticate($response->header('WWW-Authenticate'));
$request->authorization_basic($options{'http-user'}, $options{'http-passwd'});
$response = $agent->request($request);
} else {
if($cgi) {
print $cgi->header(-type => 'text/vnd.wap.wml'), <<"PASSFORM"; exit
$defaults{wmlvers}
Please enter your user name and password for $realm.
User:
Password:
PASSFORM
} else {
fatal <<"PASSASK"
website requires authentication
The web site requires you to authenticate in order to process your request.
Please enter your user name and password for $realm.
Use the --http-user and --http-passwd options (or their short counterparts
-U and -P). Check the documentation for more information.
PASSASK
}
}
} else {
my $err = <<"ERR";
The following error occured while trying to access the following URL
-- $uri --
Error @{[ $response->status_line ]}
ERR
if($cgi) {
if($quiet) {
warning "can't fetch file:\n", $err;
return '';
} else {
cgi_error($err)
}
} else {
fatal "fetch error\n\n", $err
}
}
}
return wantarray ? ($response->content, $response->content_type,
$response->content_encoding) : $response->content
}
#
# read_file()
# ---------
# This function reads and returns the file from the local disk.
#
sub read_file {
my $filepath = shift;
my $quiet = shift || 0;
my $dir = dirname($filepath);
my $file = basename($filepath);
chdir($dir) if $dir;
open(FILE, $file) or my $failed = 1;
if($failed) {
if($quiet) {
warning("can't read file '$file': $!\n") and return ''
} else {
fatal("can't read file '$file': $!\n")
}
}
local $/ = undef;
$file = ;
close(FILE);
return $file
}
#
# send_image()
# ----------
# This function allow Html2Wml to send WBMP images to the client.
# Currently, it send an empty hardcoded image, but support for
# conversion from common formats (GIF, JPEG, PNG) will be added soon.
#
sub send_image {
my $data = shift;
my $path = shift;
my $pixel = pack 'C*', 0, 0, 1, 1, 0xFF; ## this is one white pixel
## TODO: add the code to allow conversion using an external program
print $cgi->header(-type => 'image/wbmp', -content_length => length $pixel), $pixel;
exit
}
#
# convert_entities()
# ----------------
# This function converts the named HTML entities into numeric entities.
#
sub convert_entities {
my $ascii = $options{ascii};
## try to correct unclosed named entities
$_[0] =~ s/(&\w{2,6})\b([^;])/$1;$2/go;
## convert numeric entities and non-ASCII characters
## to ASCII equivalent if requested
if($ascii) {
$_[0] =~ s/(\d+);/$num2ascii{$1}/g;
$_[0] =~ s/([\x80-\xFF])/$num2ascii{ord($1)}/g;
}
my $code = q| while($_[0] =~ /&(\w+);/g) { |
. q| my $ent = $1; |
. q| if(exists $entities{$ent}) { |
.($ascii ? q| my $chr = $entities{$ent}[1]; |
: q| my $chr = ''.$entities{$ent}[0].';'; | )
. q| $_[0] =~ s/&$ent;/$chr/g |
. q| } |
. q| } |;
eval $code;
if($_[0] =~ /&(\w{2,6});?/) {
## there are some residual unknown or incorrect named entities
while($_[0] =~ /&(\w{2,6});?/g) {
my $ent = $1;
## check if $ent is a known entity
if(exists $entities{$ent}) {
warning "unclosed entity: $ent, corrected\n";
my $chr = $ascii ? $entities{$ent}[1] : ''.$entities{$ent}[0].';';
$_[0] =~ s//$chr/;
next
}
my($e1,$e2) = ('','');
## split the entity in two parts and check if the first part
## is a valid entity name
## entities names are between 2 and 6 characters long, so this
## loop won't be executed more than 4 times
for my $i (2..length($ent)) {
$e1 = substr($ent, 0, $i);
$e2 = substr($ent, $i);
last if exists $entities{$e1}
}
if(exists $entities{$e1}) {
warning "unknown entity: $ent, replaced with $e1\n";
my $chr = $ascii ? $entities{$e1}[1] : ''.$entities{$e1}[0].';';
$_[0] =~ s//$chr$e2/;
} else {
warning "unknown entity: $ent\n";
$_[0] =~ s//&$ent/;
}
}
}
## escape the remaining ampersands
$_[0] =~ s/&(\w+[^;])/&$1/g;
}
#
# clean_spaces()
# ------------
sub clean_spaces {
$_[0] =~ s/\t+/ /go;
$_[0] =~ s/^\s+/ /go;
$_[0] =~ s/ +/ /go;
}
#
# HTML::Parser start tag handler
#
sub start_tag {
my($tag, $attr) = @_;
local $_;
return unless exists $with{$tag};
return if $state{skip};
my $curr_tag = ($with{$tag}{action} eq 'replace' ? $with{$tag}{new_value} : $tag);
my $prev_tag = scalar @{$state{stack}} ? ${$state{stack}}[-1] : 0;
## prevent incorrect auto-nesting
return if $curr_tag eq $prev_tag and $with{$curr_tag}{unique} and $with{$curr_tag}{nest} !~ /\b$curr_tag\b/;
## special case: replacing image with its alternative text when necessary
if($curr_tag eq 'img' and $options{'img-alt-text'}) {
my $alt = $attr->{alt} || $attr->{title} || $attr->{id} || $attr->{name} || '[IMG]';
text_tag($alt) and return
}
## special case: tag
if($tag eq 'frame') {
if($prev_tag eq 'p') { pop @{$state{stack}}; $state{output} .= '' }
if($prev_tag eq 'wml') { push @{$state{stack}}, 'card'; $state{output} .= '' }
my $link = xlate_url($$attr{src}, 'href');
$state{output} .= qq|