|
[code]#!/usr/bin/perl -T
#
# W3C Markup Validation Service
# A CGI script to retrieve and validate a markup file
#
# Copyright 1995-2007 World Wide Web Consortium, (Massachusetts
# Institute of Technology, European Research Consortium for Informatics
# and Mathematics, Keio University). All Rights Reserved.
#
# Originally written by Gerald Oskoboiny <gerald@w3.org>
# for additional contributors, see http://dev.w3.org/cvsweb/validator/
# and http://validator.w3.org/about.html#credits
#
# This source code is available under the license at:
# http://www.w3.org/Consortium/Legal/copyright-software
#
# $Id: check,v 1.573 2007/10/03 10:42:08 ot Exp $
#
# Disable buffering on STDOUT!
$| = 1;
#
# We need Perl 5.8.0+.
use 5.008;
###############################################################################
#### Load modules. ############################################################
###############################################################################
#
# Pragmas.
use strict;
use warnings;
use utf8;
#
# Modules. See also the BEGIN block further down below.
#
# Version numbers given where we absolutely need a minimum version of a given
# module (gives nicer error messages). By default, add an empty import list
# when loading modules to prevent non-OO or poorly written modules from
# polluting our namespace.
#
use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect);
use CGI::Carp qw(carp croak fatalsToBrowser);
use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder
use Encode qw();
use Encode::Alias qw();
use Encode::HanExtra qw(); # for some chinese character encodings,
# e.g gb18030
use Encode::JIS2K qw(); # ditto extra japanese encodings
use File::Spec qw();
use HTML::Encoding 0.52 qw();
use HTML::Parser 3.25 qw(); # Need 3.25 for $p->ignore_elements.
use HTML::Template 2.6 qw();
use HTTP::Negotiate qw();
use HTTP::Request qw();
use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*.
use SGML::Parser::OpenSP qw();
use URI qw();
use URI::Escape qw(uri_escape);
use XML::LibXML qw();
###############################################################################
#### Constant definitions. ####################################################
###############################################################################
#
# Define global constants
use constant TRUE => 1;
use constant FALSE => 0;
#
# Tentative Validation Severities.
use constant T_WARN => 4; # 0000 0100
use constant T_ERROR => 8; # 0000 1000
#
# Output flags for error processing
use constant O_SOURCE => 1; # 0000 0001
use constant O_CHARSET => 2; # 0000 0010
use constant O_DOCTYPE => 4; # 0000 0100
use constant O_NONE => 8; # 0000 1000
#
# Define global variables.
use vars qw($DEBUG $CFG $RSRC $VERSION);
#
# Things inside BEGIN don't happen on every request in persistent environments
# (such as mod_perl); so let's do the globals, eg. read config, here.
BEGIN {
# Launder data for -T; -AutoLaunder doesn't catch this one.
if (exists $ENV{W3C_VALIDATOR_HOME}) {
$ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/;
$ENV{W3C_VALIDATOR_HOME} = $1;
}
#
# Read Config Files.
eval {
my %config_opts = (
-ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'),
-MergeDuplicateOptions => TRUE,
-MergeDuplicateBlocks => TRUE,
-SplitPolicy => 'equalsign',
-UseApacheInclude => TRUE,
-IncludeRelative => TRUE,
-InterPolateVars => TRUE,
-AutoLaunder => TRUE,
-AutoTrue => TRUE,
-DefaultConfig => {
Protocols => {Allow => 'http,https'},
Paths => {
Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'),
},
},
);
my %cfg = Config::General->new(%config_opts)->getall();
$CFG = \%cfg;
};
if ($@) {
die <<".EOF.";
Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable
or copy conf/* to /etc/w3c/. Make sure that the configuration file and all
included files are readable by the web server user. The error was:\n'$@'
.EOF.
}
#
# Check a filesystem path for existance and "readability".
sub pathcheck (@) {
my %paths = map { $_ => [-d $_, -r _] } @_;
my @_d = grep {not $paths{$_}->[0]} keys %paths;
my @_r = grep {not $paths{$_}->[1]} keys %paths;
return TRUE if (scalar(@_d) + scalar(@_r) == 0);
die <<".EOF." if scalar @_d;
Does not exist or is not a directory: @_d
.EOF.
die <<".EOF." if scalar @_r;
Directory not readable (permission denied): @_r
.EOF.
}
#
# Check paths in config...
# @@FIXME: This does not do a very good job error-message-wise if a path is
# @@FIXME: missing...;
{
my @dirs = ();
push @dirs, $CFG->{Paths}->{Base};
push @dirs, $CFG->{Paths}->{Templates};
push @dirs, $CFG->{Paths}->{SGML}->{Library};
&pathcheck(@dirs);
}
#
# Split allowed protocols into a list.
if (my $allowed = delete($CFG->{Protocols}->{Allow})) {
$CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)];
}
{ # Make types config indexed by FPI.
my $_types = {};
map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} }
keys %{$CFG->{Types}};
$CFG->{Types} = $_types;
}
#
# Change strings to internal constants in MIME type mapping.
for (keys %{$CFG->{MIME}}) {
$CFG->{MIME}->{$_} = 'TBD' unless $CFG->{MIME}->{$_} eq 'SGML'
or $CFG->{MIME}->{$_} eq 'XML';
}
#
# Set debug flag.
if ($CFG->{'Allow Debug'}) {
$DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'};
} else {
$DEBUG = FALSE;
}
#
# Strings
$VERSION = q$Revision: 1.573 $;
$VERSION =~ s/Revision: ([\d\.]+) /$1/;
#
# Use passive FTP by default.
$ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
# Read friendly error message file
my %rsrc = Config::General->new(
-MergeDuplicateBlocks => 1,
-ConfigFile => File::Spec->catfile($CFG->{Paths}->{Templates},
'en_US', 'error_messages.cfg'),
)->getall();
# 'en_US' should be replaced by $lang for lang-neg
# Config::General workarounds for <msg 0> issues:
# http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0022.html
# http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0025.html
# https://rt.cpan.org/Public/Bug/Display.html?id=17852
$rsrc{msg}{0} ||=
delete($rsrc{'msg 0'}) || # < 2.31
{ original => delete($rsrc{msg}{original}), # 2.31
verbose => delete($rsrc{msg}{verbose}),
};
$RSRC = \%rsrc;
} # end of BEGIN block.
#
# Get rid of (possibly insecure) $PATH.
delete $ENV{PATH};
#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
#use Data::Dumper qw(Dumper);
#print Dumper($CFG);
#exit;
#@@DEBUG;
###############################################################################
#### Process CGI variables and initialize. ####################################
###############################################################################
#
# Create a new CGI object.
my $q = new CGI;
#
# The data structure that will hold all session data.
# @@FIXME This can't be my() as $File will sooner or
# later be undef and add_warning will cause the script
# to die. our() seems to work but has other problems.
# @@FIXME Apparently, this must be set to {} also,
# otherwise the script might pick up an old object
# after abort_if_error_flagged under mod_perl.
our $File = {};
#################################
# Initialize the datastructure. #
#################################
#
# Charset data (casing policy: lowercase early).
$File->{Charset}->{Use} = ''; # The charset used for validation.
$File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F)
$File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter.
$File->{Charset}->{META} = ''; # From HTML's <meta http-equiv>.
$File->{Charset}->{XML} = ''; # From the XML Declaration.
$File->{Charset}->{Override} = ''; # From CGI/user override.
#
# Misc simple types.
$File->{Mode} = 'SGML'; # Default parse mode is SGML.
# By default, perform validation (we may perform only xml-wf in some cases)
$File->{XMLWF_ONLY} = FALSE;
#
# Listrefs.
$File->{Warnings} = []; # Warnings...
$File->{Namespaces} = []; # Other (non-root) Namespaces.
# By default, doctype-less documents can not be valid
$File->{"DOCTYPEless OK"} = FALSE;
###############################################################################
#### Generate Template for Result. ############################################
###############################################################################
# in case there is no language set up on the server, we'll use english as default:
if (!defined $CFG->{Languages}) {
$CFG->{Languages} = "en";
}
# first we determine the chosen language based on
# 1) lang argument given as parameter (if this language is available)
# 2) HTTP language negotiation between variants available and user-agent choices
# 3) English by default
my $lang = $q->param('lang') ? $q->param('lang') : '';
my @localizations;
my $lang_ok = FALSE;
foreach my $lang_available (split(" ", $CFG->{Languages})) {
if ($lang eq $lang_available) {
$lang_ok = TRUE;
next;
}
}
if (($lang eq '') or (!$lang_ok)) { # use HTTP-based negotiation
$lang = '';
foreach my $lang_available (split(" ", $CFG->{Languages})) {
push @localizations, [$lang_available, 1.000, 'text/html', undef,
'utf-8', $lang_available, undef];
}
$lang = HTTP::Negotiate::choose(\@localizations);
}
# HTTP::Negotiate::choose may return undef
# e.g if sent Accept-Language: en;q=0
$lang = 'en_US' if (!defined($lang));
if ($lang eq "en") {
$lang = 'en_US'; # legacy
}
my %template_defaults = (
die_on_bad_params => FALSE,
cache => TRUE,
);
$File->{Templates}->{Result} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'result.tmpl'),
loop_context_vars => TRUE,
filter => sub {
my $ref = shift;
${$ref} = Encode::decode_utf8(${$ref});
}
);
$File->{Templates}->{Error} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'fatal-error.tmpl'),
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{AuthzReq} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'http_401_authrequired.tmpl'),
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
# templates for alternate output formats
$File->{Templates}->{XML} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'xml_output.tmpl'),
loop_context_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{SOAP} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'soap_output.tmpl'),
loop_context_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{UCN} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'ucn_output.tmpl'),
loop_context_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{SOAPFault} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'soap_fault.tmpl'),
loop_context_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{SOAPDisabled} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'soap_disabled.tmpl'),
loop_context_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{EARLXML} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'earl_xml.tmpl'),
loop_context_vars => TRUE,
global_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{EARLN3} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'earl_n3.tmpl'),
loop_context_vars => TRUE,
global_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{PrefillHTML} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'prefill_html401.tmpl'),
loop_context_vars => TRUE,
global_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{PrefillXHTML} = HTML::Template->new(
%template_defaults,
filename => File::Spec->catfile($CFG->{Paths}->{Templates},
$lang, 'prefill_xhtml10.tmpl'),
loop_context_vars => TRUE,
global_vars => TRUE,
filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});}
);
$File->{Templates}->{Result}->param(cfg_home_page => $CFG->{'Home Page'});
$File->{Templates}->{SOAP}->param(cfg_home_page => $CFG->{'Home Page'});
undef $lang;
undef %template_defaults;
#########################################
# Populate $File->{Opt} -- CGI Options. #
#########################################
#
# Preprocess the CGI parameters.
$q = &prepCGI($File, $q);
#
# Set session switches.
$File->{Opt}->{'Outline'} = $q->param('outline') ? TRUE : FALSE;
$File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE;
$File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE;
$File->{Opt}->{'Show Parsetree'} = $q->param('sp') ? TRUE : FALSE;
$File->{Opt}->{'No Attributes'} = $q->param('noatt') ? TRUE : FALSE;
$File->{Opt}->{'Show ESIS'} = $q->param('esis') ? TRUE : FALSE;
$File->{Opt}->{'Show Errors'} = $q->param('errors') ? TRUE : FALSE;
$File->{Opt}->{'Verbose'} = $q->param('verbose') ? TRUE : FALSE;
$File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE;
$File->{Opt}->{'Debug'} = $q->param('debug') ? TRUE : FALSE;
$File->{Opt}->{'No200'} = $q->param('No200') ? TRUE : FALSE;
$File->{Opt}->{'Charset'} = $q->param('charset') ? lc $q->param('charset'): '';
$File->{Opt}->{'DOCTYPE'} = $q->param('doctype') ? $q->param('doctype') : '';
$File->{Opt}->{'Output'} = $q->param('output') ? $q->param('output') : 'html';
$File->{Opt}->{'Max Errors'} = $q->param('me') ? $q->param('me') : '';
$File->{Opt}->{'Prefill'} = $q->param('prefill') ? TRUE : FALSE;
$File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') ? $q->param('prefill_doctype') : 'html401';
$File->{Opt}->{'Accept Header'} = $q->param('accept') ? $q->param('accept') : '';
$File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') ? $q->param('accept-language') : '';
#
# "Fallback" info for Character Encoding (fbc), Content-Type (fbt),
# and DOCTYPE (fbd). If TRUE, the Override values are treated as
# Fallbacks instead of Overrides.
$File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE;
$File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE;
$File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE;
#
# If ";debug" was given, let it overrule the value from the config file,
# regardless of whether it's "0" or "1" (on or off), but only if config
# allows the debugging options.
if ($CFG->{'Allow Debug'}) {
$DEBUG = $q->param('debug') if defined $q->param('debug');
$File->{Opt}->{Verbose} = TRUE if $DEBUG;
} else {
$DEBUG = FALSE; # The default.
}
&abort_if_error_flagged($File, O_NONE);
#
# Get the file and metadata.
if ($q->param('uploaded_file')) {
$File = &handle_file($q, $File);
} elsif ($q->param('fragment')) {
$File = &handle_frag($q, $File);
} elsif ($q->param('uri')) {
$File = &handle_uri($q, $File);
}
#
# Abort if an error was flagged during initialization.
&abort_if_error_flagged($File, 0);
#
# Get rid of the CGI object.
undef $q;
#
# We don't need STDIN any more, so get rid of it to avoid getting clobbered
# by Apache::Registry's idiotic interference under mod_perl.
untie *STDIN;
###############################################################################
#### Output validation results. ###############################################
###############################################################################
$File = find_encodings($File);
#
# Decide on a charset to use (first part)
#
if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative.
$File->{Charset}->{Use} = $File->{Charset}->{HTTP};
} elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) {
# Act as if $http_charset was 'us-ascii'. (MIME rules)
$File->{Charset}->{Use} = 'us-ascii';
&add_warning('W01', {
W01_upload => $File->{'Is Upload'},
W01_agent => $File->{Server},
W01_ct => $File->{ContentType},
});
} elsif ($File->{Charset}->{XML}) {
$File->{Charset}->{Use} = $File->{Charset}->{XML};
} elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) {
$File->{Charset}->{Use} = 'utf-16';
} elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) {
$File->{Charset}->{Use} = "utf-8";
} elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
$File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
}
unless ($File->{Charset}->{Use}) {
$File->{Charset}->{Use} = $File->{Charset}->{META};
}
#
# Handle any Fallback or Override for the charset.
if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
# charset=foo was given to the CGI and it wasn't "autodetect" or empty.
#
# Extract the user-requested charset from CGI param.
my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2);
$File->{Charset}->{Override} = lc($override);
if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode
unless ($File->{Charset}->{Use}) {
&add_warning('W02', {W02_charset => $File->{Charset}->{Override}});
$File->{Tentative} |= T_ERROR; # Tag it as Invalid.
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
} else { # charset "hard override" mode
if (! $File->{Charset}->{Use}) { # overriding "nothing"
&add_warning('W04', {W04_charset => $File->{Charset}->{Override}, W04_override => TRUE});
$File->{Tentative} |= T_ERROR;
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
else { #actually overriding something
# Warn about Override unless it's the same as the real charset...
unless ($File->{Charset}->{Override} eq $File->{Charset}->{Use}) {
&add_warning('W03', {
W03_use => $File->{Charset}->{Use},
W03_opt => $File->{Charset}->{Override},
});
$File->{Tentative} |= T_ERROR;
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
}
}
}
unless ($File->{Charset}->{Use}) { # No charset given...
&add_warning('W04', {W04_charset => 'UTF-8'});
$File->{Tentative} |= T_ERROR; # Can never be valid.
$File->{Charset}->{Use} = 'utf-8';
}
#
# Abort if an error was flagged while finding the encoding.
&abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE);
#
# Encode alias definitions. This might not be the best
# place for them, feel free to move them elsewhere.
# implicit bidi, but character encoding is the same
Encode::Alias::define_alias('iso-8859-6-i', 'iso-8859-6');
# implicit bidi, but character encoding is the same
Encode::Alias::define_alias('iso-8859-8-i', 'iso-8859-8');
# 0xA0 is U+00A0 in ISO-8859-11 but undefined in tis-620
# other than that the character encodings are equivalent
Encode::Alias::define_alias('tis-620', 'iso-8859-11');
# Encode::Byte does not know 'macintosh' but MacRoman
Encode::Alias::define_alias('macintosh', 'MacRoman');
# x-mac-roman is the non-standard version of 'macintosh'
Encode::Alias::define_alias('x-mac-roman', 'MacRoman');
# Encode only knows the long hand version of 'ksc_5601'
Encode::Alias::define_alias('ksc_5601', 'KS_C_5601-1987');
# gb18030 requires Encode::HanExtra but no additional alias
#
# Always transcode, even if the content claims to be UTF-8
$File = transcode($File);
&abort_if_error_flagged($File, O_CHARSET);
#
# Add a warning if doc is UTF-8 and contains a BOM.
if ($File->{Charset}->{Use} eq 'utf-8' &&
$File->{Content}->[0] =~ m(^\x{FEFF})) {
&add_warning('W21', {});
}
#
# Overall parsing algorithm for documents returned as text/html:
#
# For documents that come to us as text/html,
#
# 1. check if there's a doctype
# 2. if there is a doctype, parse/validate against that DTD
# 3. if no doctype, check for an xmlns= attribute on the first element, or XML declaration
# 4. if no doctype and XML mode, check for XML well-formedness
# 5. otherwise , punt.
#
#
# Override DOCTYPE if user asked for it.
if ($File->{Opt}->{DOCTYPE}
and not $File->{Opt}->{DOCTYPE} =~ /(Inline|detect)/i) {
$File = &override_doctype($File);
}
#
# Try to extract a DOCTYPE or xmlns.
$File = &preparse_doctype($File);
#
# Determine the parse mode (XML or SGML).
##set_parse_mode($File, $CFG) if $File->{DOCTYPE};
set_parse_mode($File, $CFG);
#
# Sanity check Charset information and add any warnings necessary.
$File = &charset_conflicts($File);
# before we start the parsing, clean slate
$File->{'Is Valid'} = TRUE;
$File->{Errors} = [];
# preparse with XML parser if necessary
# we should really be using a SAX ErrorHandler, but I can't find
# a way to make it work with XML::LibXML::SAX::Parser... ** FIXME **
# ditto, we should try using W3C::Validator::SAXHandler,
# but it's badly linked to opensp at the moment
if (&is_xml($File)) {
my $xmlparser = XML::LibXML->new();
$xmlparser->line_numbers(1);
$xmlparser->validation(0);
$xmlparser->load_ext_dtd(0);
# [NOT] loading the XML catalog for entities resolution as it seems to cause a lot of unnecessary DTD/entities fetching (requires >= 1.53 if enabled)
#$xmlparser->load_catalog( File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc') );
my $xml_string = join"\n",@{$File->{Content}};
# the XML parser will check the value of encoding attribute in XML declaration
# so we have to amend it to reflect transcoding. see Bug 4867
$xml_string =~ s/(<\?xml.*)
(encoding[\x20|\x09|\x0D|\x0A]*=[\x20|\x09|\x0D|\x0A]*(?:"[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+'))
(.*\?>)/$1encoding="utf-8"$3/sx;
eval {
$xmlparser->parse_string($xml_string);
};
$xml_string = undef;
my $xml_parse_errors_line = undef;
my @xmlwf_error_list;
if ($@) {
my $xmlwf_errors = $@;
my $xmlwf_error_line = undef;
my $xmlwf_error_col = undef;
my $xmlwf_error_msg = undef;
my $got_error_message = 0;
my $got_quoted_line = 0;
my $num_xmlwf_error = 0;
foreach my $msg_line (split "\n", $xmlwf_errors){
$msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g;
$msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{};
# first we get the actual error message
if (($got_error_message eq 0) and ($msg_line =~ /^(:\d+:)( parser error : .*)/ )) {
$xmlwf_error_line = $1;
$xmlwf_error_msg = $2;
$xmlwf_error_line =~ s/:(\d+):/$1/;
$xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /;
$got_error_message = 1;
}
# then we skip the second line, which shows the context (we don't use that)
elsif (($got_error_message eq 1) and ($got_quoted_line eq 0)) {
$got_quoted_line = 1;
}
# we now take the third line, with the pointer to the error's column
elsif (($msg_line =~ /(\s+)\^/) and ($got_error_message eq 1) and ($got_quoted_line eq 1)) {
$xmlwf_error_col = length($1);
}
# cleanup for a number of bugs for the column number
if (defined($xmlwf_error_col)) {
if ((my $l = length($File->{Content}->[$xmlwf_error_line-1])) < $xmlwf_error_col) {
# http://bugzilla.gnome.org/show_bug.cgi?id=434196
#warn("Warning: reported error column larger than line length " .
# "($xmlwf_error_col > $l) in $File->{URI} line " .
# "$xmlwf_error_line, libxml2 bug? Resetting to line length.");
$xmlwf_error_col = $l;
}
elsif ($xmlwf_error_col == 79) {
# working around an apparent odd limitation of libxml
# which only gives context for lines up to 80 chars
# http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420
# http://bugzilla.gnome.org/show_bug.cgi?id=424017
$xmlwf_error_col = "> 80";
# non-int line number will trigger the proper behavior in report_error
}
}
# when we have all the info (one full error message), proceed and move on to the next error
if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){
# Reinitializing for the next batch of 3 lines
$got_error_message = 0;
$got_quoted_line = 0;
# formatting the error message for output
my $err;
$err->{src} = '...'; # do this with show_open_entities()?
$err->{line} = $xmlwf_error_line;
$err->{char} = $xmlwf_error_col;
$err->{num} = 'xmlwf';
$err->{type} = "E";
$err->{msg} = $xmlwf_error_msg;
# The validator will sometimes fail to dereference entities files
# we're filtering the bogus resulting error
if ($err->{msg} =~ /Entity '\w+' not defined/) {
$xmlwf_error_line = undef;
$xmlwf_error_col = undef;
$xmlwf_error_msg = undef;
next;
}
push (@xmlwf_error_list, $err);
$xmlwf_error_line = undef;
$xmlwf_error_col = undef;
$xmlwf_error_msg = undef;
$num_xmlwf_error++;
}
}
foreach my $errmsg (@xmlwf_error_list){
$File->{'Is Valid'} = FALSE;
push @{$File->{WF_Errors}}, $errmsg;
}
}
}
#
# Abandon all hope ye who enter here...
$File = &parse($File);
sub parse (\$) {
my $File = shift;
# TODO switch parser on the fly
my $opensp = SGML::Parser::OpenSP->new();
my $parser_name = "SGML::Parser::OpenSP";
#
# By default, use SGML catalog file and SGML Declaration.
my $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc');
# default parsing options
my @spopt = qw(valid non-sgml-char-ref no-duplicate);
#
# Switch to XML semantics if file is XML.
if (&is_xml($File)) {
$catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc');
push(@spopt, 'xml');
# workaround for a bug in s:p:o 0.99
# see http://www.w3.org/Bugs/Public/show_bug.cgi?id=798#c5
push(@spopt, 'xml');
# FIXME when fixed s:p:o gets released
}
else {
# add warnings for shorttags
push(@spopt, 'min-tag');
# twice, ditto above re: s:p:o bug in 0.99
push(@spopt, 'min-tag');
}
#
# Parser configuration
$opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
$opensp->catalogs($catalog);
$opensp->show_error_numbers(1);
$opensp->warnings(@spopt);
#
# Restricted file reading is disabled on Win32 for the time
# beeing since neither SGML::Parser::OpenSP nor check auto-
# magically set search_dirs to include the temp directory
# so restricted file reading would defunct the Validator.
$opensp->restrict_file_reading(1) unless $^O eq 'MSWin32';
#
# Set debug info for HTML report.
$File->{Templates}->{Result}->param(opt_debug => $DEBUG);
$File->{Templates}->{Result}->param(debug =>
[
map({name => $_, value => $ENV{$_}},
qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
{ name => 'Content-Encoding', value => $File->{ContentEnc} },
{ name => 'Content-Language', value => $File->{ContentLang} },
{ name => 'Content-Location', value => $File->{ContentLoc} },
{ name => 'Transfer-Encoding', value => $File->{TransferEnc} },
{ name => 'Parse Mode', value => $File->{Mode} },
{ name => 'Parse Mode Factor', value => $File->{ModeChoice} },
{ name => 'Parser', value => $parser_name },
{ name => 'Parser Options', value => join " ", @spopt },
],
);
$File->{Templates}->{SOAP}->param(opt_debug => $DEBUG);
$File->{Templates}->{SOAP}->param(debug =>
[
map({name => $_, value => $ENV{$_}},
qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
{ name => 'Content-Encoding', value => $File->{ContentEnc} },
{ name => 'Content-Language', value => $File->{ContentLang} },
{ name => 'Content-Location', value => $File->{ContentLoc} },
{ name => 'Transfer-Encoding', value => $File->{TransferEnc} },
{ name => 'Parse Mode', value => $File->{Mode} },
{ name => 'Parse Mode Factor', value => $File->{ModeChoice} },
{ name => 'Parser', value => $parser_name },
{ name => 'Parser Options', value => join " ", @spopt },
],
);
my $h = W3C::Validator::SAXHandler->new($opensp, $File);
$opensp->handler($h);
$opensp->parse_string(join"\n",@{$File->{Content}});
# Make sure there are no circular references, otherwise the script
# would leak memory until mod_perl unloads it which could take some
# time. @@FIXME It's probably overly careful though.
$opensp->handler(undef);
undef $h->{_parser};
undef $h->{_file};
undef $h;
undef $opensp;
#
# Set Version to be the FPI initially.
$File->{Version} = $File->{DOCTYPE};
return $File;
}
#
# Force "XML" if type is an XML type and an FPI was not found.
# Otherwise set the type to be the FPI.
if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') {
$File->{Version} = 'XML';
} else {
$File->{Version} = $File->{DOCTYPE} unless $File->{Version};
}
#
# Get the pretty text version of the FPI if a mapping exists.
if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) {
$File->{Version} = $prettyver;
}
#
# check the received mime type against Allowed mime types
if ($File->{ContentType}){
my @allowedMediaType =
split(/\s+/, $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || '');
my $usedCTisAllowed;
if (scalar @allowedMediaType) {
$usedCTisAllowed = FALSE;
foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); }
}
else {
# wedon't know what media type is recommended, so better shut up
$usedCTisAllowed = TRUE;
}
if(! $usedCTisAllowed ){
&add_warning('W23', {
W23_type => $File->{ContentType},
W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred},
w23_doctype => $File->{Version}
});
}
}
#
# Warn about unknown, incorrect, or missing Namespaces.
if ($File->{Namespace}) {
my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;
if (&is_xml($File)) {
if ($ns eq $File->{Namespace}) {
&add_warning('W10', {
W10_ns => $File->{Namespace},
W10_type => $File->{Type},
});
}
} else {
&add_warning('W11', {W11_ns => $File->{Namespace}});
}
} else {
if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
&add_warning('W12', {});
}
}
## if invalid content, AND if requested, pass through tidy
if ((! $File->{'Is Valid'}) and ($File->{Opt}->{'Show Tidy'}) ) {
eval {
local $SIG{__DIE__};
require HTML::Tidy;
my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}});
$File->{'Tidy'} = Encode::decode('utf-8', $tidy->clean(join"\n",@{$File->{Content}}));
$File->{'Tidy_OK'} = TRUE;
};
if ($@) {
$File->{'Tidy_OK'} = FALSE;
}
}
else {
# if document is valid, we don't really need tidy, do we?
$File->{'Tidy_OK'} = FALSE;
}
if (!$File->{'Tidy_OK'}) {
# if tidy not available, disable
$File->{Opt}->{'Show Tidy'} = FALSE;
}
my $template;
if ($File->{Opt}->{Output} eq 'xml') {
$template = $File->{Templates}->{XML};
} elsif ($File->{Opt}->{Output} eq 'earl') {
$template = $File->{Templates}->{EARLXML};
} elsif ($File->{Opt}->{Output} eq 'n3') {
$template = $File->{Templates}->{EARLN3};
} elsif ($File->{Opt}->{Output} eq 'ucn') {
$template = $File->{Templates}->{UCN};
} elsif ($File->{Opt}->{Output} eq 'soap12') {
if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation...
print CGI::header(-status => 503, -content_language => "en",
-type => "text/html", -charset => "utf-8"
);
$template = $File->{Templates}->{SOAPDisabled};
} elsif ($File->{'Error Flagged'}) { # should send SOAP fault message
$template = $File->{Templates}->{SOAPFault};
# we fill the soap fault template
#with the variables that had been passed to the HTML fatal error template
foreach my $fault_param ($File->{Templates}->{Error}->param()) {
$template->param($fault_param => $File->{Templates}->{Error}->param($fault_param));
}
} else {
$template = $File->{Templates}->{SOAP};
}
} else {
$template = $File->{Templates}->{Result};
}
&prep_template($File, $template);
&fin_template($File, $template);
$template->param(file_warnings => $File->{Warnings});
$template->param(tidy_output => $File->{'Tidy'});
$template->param(file_source => &source($File))
if ($template->param('opt_show_source') or ($File->{'Is Upload'}) or ($File->{'Direct Input'}));
#$template->param('opt_show_esis' => TRUE)
# if $File->{Opt}->{'Show ESIS'};
#$template->param('opt_show_raw_errors' => TRUE)
# if $File->{Opt}->{'Show Errors'};
#$template->param('file_raw_errors' => &show_errors($File))
# if $template->param('opt_show_raw_errors');
# $T->param(file_outline => &outline($File)) if $T->param('opt_show_outline');
# transcode output from perl's internal to utf-8 and output
print Encode::encode('UTF-8', $template->output);
#
# Get rid of $File object and exit.
undef $File;
exit;
#############################################################################
# Subroutine definitions
#############################################################################
#
# Generate HTML report.
sub prep_template ($$) {
my $File = shift;
my $T = shift;
#
# XML mode...
$T->param(is_xml => &is_xml($File));
#
# Upload?
$T->param(is_upload => $File->{'Is Upload'});
#
# Direct Input?
$T->param(is_direct_input => $File->{'Direct Input'});
#
# The URI...
$T->param(file_uri => $File->{URI});
$T->param(file_uri_param => uri_escape($File->{URI}));
#
# Set URL for page title.
$T->param(page_title_url => $File->{URI});
#
# Metadata...
$T->param(file_modified => $File->{Modified});
$T->param(file_server => $File->{Server});
$T->param(file_size => $File->{Size});
$T->param(file_contenttype => $File->{ContentType});
$T->param(file_charset => $File->{Charset}->{Use});
$T->param(file_doctype => $File->{DOCTYPE});
#
# Output options...
$T->param(opt_show_source => $File->{Opt}->{'Show Source'});
$T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'});
$T->param(opt_show_outline => $File->{Opt}->{'Outline'});
$T->param(opt_show_parsetree => $File->{Opt}->{'Show Parsetree'});
$T->param(opt_show_noatt => $File->{Opt}->{'No Attributes'});
$T->param(opt_verbose => $File->{Opt}->{'Verbose'});
$T->param(opt_group_errors => $File->{Opt}->{'Group Errors'});
$T->param(opt_no200 => $File->{Opt}->{'No200'});
#
# Tip of the Day...
my $tip = &get_tip();
$T->param(tip_uri => $tip->[0]);
$T->param(tip_slug => $tip->[1]);
# Root Element
$T->param(root_element => $File->{Root});
# Namespaces...
$T->param(file_namespace => $File->{Namespace});
my %seen_ns = ();
my @bulk_ns = @{$File->{Namespaces}};
$File->{Namespaces} = []; # reinitialize the list of non-root namespaces
# ... and then get a uniq version of it
foreach my $single_namespace (@bulk_ns) {
push(@{$File->{Namespaces}}, $single_namespace) unless (($single_namespace eq $File->{Namespace}) or $seen_ns{$single_namespace}++);
}
my @nss = map({uri => $_}, @{$File->{Namespaces}});
$T->param(file_namespaces => \@nss) if @nss;
if ($File->{Opt}->{DOCTYPE}) {
my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}";
$T->param($over_doctype_param => TRUE);
}
if ($File->{Opt}->{Charset}) {
my $over_charset_param = "override charset $File->{Opt}->{Charset}";
$T->param($over_charset_param => TRUE);
}
if ($File->{'Error Flagged'}) {
$T->param(fatal_error => TRUE);
}
}
sub fin_template ($$) {
my $File = shift;
my $T = shift;
if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) {
# @@TODO@@ we should try falling back on other version
# info, such as the ones stored in Version_ESIS
$T->param(file_version => '(no Doctype found)');
}
else {
$T->param(file_version => $File->{Version});
}
my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File);
if ($num_errors+$num_warnings > 0)
{
$T->param(has_errors => 1);
}
$T->param(valid_errors_num => $num_errors);
$num_warnings += scalar @{$File->{Warnings}};
$T->param(valid_warnings_num => $num_warnings);
my $number_of_errors = ""; # textual form of $num_errors
my $number_of_warnings = ""; # textual form of $num_errors
# The following is a bit hack-ish, but will enable us to have some logic
# for a human-readable display of the number, with cases for 0, 1, 2 and above
# (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above)
if ($num_errors > 1) {
$T->param(number_of_errors_is_0 => FALSE );
$T->param(number_of_errors_is_1 => FALSE);
if ($num_errors eq 2) {
$T->param(number_of_errors_is_2 => TRUE);
}
else {
$T->param(number_of_errors_is_2 => FALSE );
}
$T->param(number_of_errors_is_plural => TRUE );
}
elsif ($num_errors eq 1) {
$T->param(number_of_errors_is_0 => FALSE );
$T->param(number_of_errors_is_1 => TRUE );
$T->param(number_of_errors_is_2 => FALSE );
$T->param(number_of_errors_is_plural => FALSE );
}
else { # 0
$T->param(number_of_errors_is_0 => TRUE );
$T->param(number_of_errors_is_1 => FALSE );
$T->param(number_of_errors_is_2 => FALSE );
$T->param(number_of_errors_is_plural => FALSE );
}
if ($num_warnings > 1) {
$T->param(number_of_warnings_is_0 => FALSE );
$T->param(number_of_warnings_is_1 => FALSE);
if ($num_warnings eq 2) {
$T->param(number_of_warnings_is_2 => TRUE);
}
else {
$T->param(number_of_warnings_is_2 => FALSE);
}
$T->param(number_of_warnings_is_plural => TRUE );
}
elsif ($num_warnings eq 1) {
$T->param(number_of_warnings_is_0 => FALSE );
$T->param(number_of_warnings_is_1 => TRUE );
$T->param(number_of_warnings_is_2 => FALSE );
$T->param(number_of_warnings_is_plural => FALSE );
}
else { # 0
$T->param(number_of_warnings_is_0 => TRUE );
$T->param(number_of_warnings_is_1 => FALSE );
$T->param(number_of_warnings_is_2 => FALSE );
$T->param(number_of_warnings_is_plural => FALSE );
}
$T->param(file_errors => $reported_errors);
if ($File->{'Is Valid'}) {
$T->param(VALID => TRUE);
$T->param(valid_status => 'Valid');
&report_valid($File, $T);
} else {
$T->param(VALID => FALSE);
$T->param(valid_status => 'Invalid');
}
}
#
# Output "This page is Valid" report.
sub report_valid {
my $File = shift;
my $T = shift;
unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) {
if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) {
my $cfg = $CFG->{Types}->{$File->{DOCTYPE}};
$T->param(have_badge => TRUE);
$T->param(badge_uri => $cfg->{Badge}->{URI});
if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local URI'}) {
$T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'});
$T->param(have_local_badge => TRUE);
}
if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'ALT URI'}) {
$T->param(badge_alt_uri => $cfg->{Badge}->{'ALT URI'});
if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local ALT URI'}) {
$T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'});
}
$T->param(have_alt_badge => TRUE);
}
$T->param(badge_alt => $cfg->{Badge}->{Alt});
$T->param(badge_h => $cfg->{Badge}->{Height});
$T->param(badge_w => $cfg->{Badge}->{Width});
$T->param(badge_tagc => ($cfg->{'Parse Mode'} eq 'XML' ? ' /' : ''));
}
} elsif (defined $File->{Tentative}) {
$T->param(is_tentative => TRUE);
}
if ($File->{Opt}->{'Outline'}) {
$T->param(file_outline => $File->{heading_outline});
}
if ($File->{XMLWF_ONLY}){
$T->param(xmlwf_only => TRUE);
}
my $thispage = self_url_file($File);
$T->param(file_thispage => $thispage);
}
#
# Add a waring message to the output.
sub add_warning ($$) {
my $WID = shift;
my $params = shift;
push @{$File->{Warnings}}, $WID;
$File->{Templates}->{Result}->param($WID => TRUE, %{$params});
$File->{Templates}->{Result}->param(have_warnings => TRUE);
$File->{Templates}->{Error}->param($WID => TRUE, %{$params});
$File->{Templates}->{Error}->param(have_warnings => TRUE);
$File->{Templates}->{SOAP}->param($WID => TRUE, %{$params});
$File->{Templates}->{SOAP}->param(have_warnings => TRUE);
}
#
# Proxy authentication requests.
# Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth).
sub authenticate {
my $File = shift;
my $resource = shift;
my $authHeader = shift || {};
my $realm = $resource;
$realm =~ s([^\w\d.-]*){}g;
for my $scheme (keys(%$authHeader)) {
my $origrealm = $authHeader->{$scheme}->{realm};
if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) {
delete($authHeader->{$scheme});
next;
}
$authHeader->{$scheme}->{realm} = "$realm-$origrealm";
}
my $headers = HTTP::Headers->new(Connection => 'close');
$headers->www_authenticate(%$authHeader);
$headers = $headers->as_string();
chomp($headers);
$File->{Templates}->{AuthzReq}->param(http_401_headers => $headers);
$File->{Templates}->{AuthzReq}->param(http_401_url => $resource);
print $File->{Templates}->{AuthzReq}->output;
exit; # Further interaction will be a new HTTP request.
}
#
# Fetch an URL and return the content and selected meta-info.
sub handle_uri {
my $q = shift; # The CGI object.
my $File = shift; # The master datastructure.
my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical();
$uri->fragment(undef);
my $ua = new W3C::Validator::UserAgent ($CFG, $File);
$ua->env_proxy();
$ua->agent("W3C_Validator/$VERSION");
$ua->parse_head(0); # Don't parse the http-equiv stuff.
$ua->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']);
unless ($ua->is_protocol_supported($uri)) {
$File->{'Error Flagged'} = TRUE;
if (($uri->canonical() eq "1") )
#if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI()
{
$File->{Templates}->{Error}->param(fatal_no_content => TRUE);
}
else {
$File->{Templates}->{Error}->param(fatal_uri_error => TRUE);
$File->{Templates}->{Error}->param(fatal_uri_scheme => $uri->scheme());
}
return $File;
}
return $File unless $ua->uri_ok($uri);
my $req = new HTTP::Request(GET => $uri);
# telling caches in the middle we want a fresh copy (Bug 4998)
$req->header(Cache_control=> "max-age=0");
# if one wants to use the accept and accept-language params
# in order to trigger specific negotiation
if ($File->{Opt}->{'Accept Header'}) {
$req->header(Accept => $File->{Opt}->{'Accept Header'});
}
if ($File->{Opt}->{'Accept-Language Header'}) {
$req->header(Accept_Language => $File->{Opt}->{'Accept-Language Header'});
}
# If we got a Authorization header, the client is back at it after being
# prompted for a password so we insert the header as is in the request.
if($ENV{HTTP_AUTHORIZATION}){
$req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
}
my $res = $ua->request($req);
return $File if $File->{'Error Flagged'}; # Redirect IP rejected?
unless ($res->code == 200 or $File->{Opt}->{'No200'}) {
if ($res->code == 401) {
my %auth = $res->www_authenticate(); # HTTP::Headers::Auth
&authenticate($File, $res->request->uri, \%auth);
} else {
$File->{'Error Flagged'} = TRUE;
$File->{Templates}->{Error}->param(fatal_http_error => TRUE);
$File->{Templates}->{Error}->param(fatal_http_uri => $uri->as_string);
$File->{Templates}->{Error}->param(fatal_http_code => $res->code);
$File->{Templates}->{Error}->param(fatal_http_msg => $res->message);
$File->{Templates}->{Error}->param(fatal_http_dns => TRUE)
if $res->code == 500;
}
return $File;
}
#
# Enforce Max Recursion level.
&check_recursion($File, $res);
my ($mode, $ct, $
|