# $Id: Twig.pm.slow,v 1.60 2002/11/01 17:12:52 mrodrigu Exp $ # # Copyright (c) 1999-2002 Michel Rodriguez # All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # This is created in the caller's space BEGIN { sub ::PCDATA { '#PCDATA' } sub ::CDATA { '#CDATA' } } ###################################################################### package XML::Twig; ###################################################################### require 5.004; use strict; use bytes; # activated by speedup if version >= 5.006 use vars qw($VERSION @ISA %valid_option); use Carp; #start-extract twig_global # constants: element types use constant (PCDATA => '#PCDATA'); use constant (CDATA => '#CDATA'); use constant (PI => '#PI'); use constant (COMMENT => '#COMMENT'); use constant (ENT => '#ENT'); # element classes use constant (ELT => '#ELT'); use constant (TEXT => '#TEXT'); # element properties use constant (ASIS => '#ASIS'); use constant (EMPTY => '#EMPTY'); #end-extract twig_global # used in parseurl to set the buffer size to the same size as in XML::Parser::Expat use constant (BUFSIZE => 32768); use constant (DEBUG => 0); # used to store the gi's my %gi2index; # gi => index my @index2gi; # list of gi's my $SPECIAL_GI; # first non-special gi; my %base_ent; # base entity character => replacement # flag, set to true if the weaken sub is available use vars qw( $weakrefs); #start-extract twig_global my $REG_NAME = q{(?:(?:[^\W\d_]|\#)[\w:.-]*)}; # xml name my $REG_NAME_W = q{(?:(?:[^\W\d_]|\#)[\w:.-]*|\*)}; # name or wildcard (* or '') my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers my $REG_MATCH = q{[!=]~}; # match (or not) my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted) my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value my $REG_OP = q{=|==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge}; # op #end-extract twig_global my $parser_version; BEGIN { $VERSION = '3.09'; use XML::Parser; my $needVersion = '2.23'; $parser_version= $XML::Parser::VERSION; croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion; # test whether we can use weak references if( eval 'require Scalar::Util') { import Scalar::Util qw(weaken); $weakrefs= 1; } elsif( eval 'require WeakRef') { import WeakRef; $weakrefs= 1; } else { $weakrefs= 0; } # warn "weak references used\n" if( $weakrefs); import XML::Twig::Elt; import XML::Twig::Entity; import XML::Twig::Entity_list; # used to store the gi's # should be set for each twig really, at least when there are several # the init ensures that special gi's are always the same # gi => index # do NOT use => or the constants become quoted! %XML::Twig::gi2index=( PCDATA, 0, CDATA, 1, PI, 2, COMMENT, 3, ENT, 4); # list of gi's @XML::Twig::index2gi=( PCDATA, CDATA, PI, COMMENT, ENT); # gi's under this value are special $XML::Twig::SPECIAL_GI= @XML::Twig::index2gi; %XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"', ); # now set some aliases *find_nodes = *get_xpath; *getElementsByTagName = *descendants; *descendants_or_self = *descendants; *find_by_tag_name = *descendants; *getEltById = *elt_id; } @ISA = qw(XML::Parser); # fake gi's used in twig_handlers and start_tag_handlers my $ALL = '_all_'; # the associated function is always called my $DEFAULT= '_default_'; # the function is called if no other handler has been # some defaults my $COMMENTS_DEFAULT= 'keep'; my $PI_DEFAULT = 'keep'; # handlers used in regular mode my %twig_handlers=( Start => \&twig_start, End => \&twig_end, Char => \&twig_char, Entity => \&twig_entity, XMLDecl => \&twig_xmldecl, Doctype => \&twig_doctype, Element => \&twig_element, Attlist => \&twig_attlist, CdataStart => \&twig_cdatastart, CdataEnd => \&twig_cdataend, Proc => \&twig_pi, Comment => \&twig_comment, Default => \&twig_default, ); # handlers used when twig_roots is used and we are outside of the roots my %twig_handlers_roots= ( Start => \&twig_start_check_roots, End => \&twig_end_check_roots, Doctype => \&twig_doctype, Char => undef, Entity => undef, XMLDecl => \&twig_xmldecl, Element => undef, Attlist => undef, CdataStart => undef, CdataEnd => undef, Proc => undef, Comment => undef, Proc => \&twig_pi_check_roots, Default => sub {}, # hack needed for XML::Parser 2.27 ); # handlers used when twig_roots and print_outside_roots are used and we are # outside of the roots my %twig_handlers_roots_print_2_30= ( Start => \&twig_start_check_roots_print, End => \&twig_end_check_roots_print, Char => \&twig_print, # I have no idea why I should not be using this handler! Entity => \&twig_print_entity, XMLDecl => \&twig_print, Doctype => \&twig_print_doctype, # because recognized_string is broken here # Element => \&twig_print, Attlist => \&twig_print, CdataStart => \&twig_print, CdataEnd => \&twig_print, Proc => \&twig_print, Comment => \&twig_print, Default => \&twig_print_check_doctype, ); # handlers used when twig_roots, print_outside_roots and keep_encoding are used # and we are outside of the roots my %twig_handlers_roots_print_original_2_30= ( Start => \&twig_start_check_roots_print_original, End => \&twig_end_check_roots_print_original, Char => \&twig_print_original, # I have no idea why I should not be using this handler! #Entity => \&twig_print_original, ExternEnt => \&twig_print_entity, XMLDecl => \&twig_print_original, Doctype => \&twig_print_original_doctype, # because original_string is broken here Element => \&twig_print_original, Attlist => \&twig_print_original, CdataStart => \&twig_print_original, CdataEnd => \&twig_print_original, Proc => \&twig_print_original, Comment => \&twig_print_original, Default => \&twig_print_original_check_doctype, ); # handlers used when twig_roots and print_outside_roots are used and we are # outside of the roots my %twig_handlers_roots_print_2_27= ( Start => \&twig_start_check_roots_print, End => \&twig_end_check_roots_print, Char => \&twig_print, # I have no idea why I should not be using this handler! #Entity => \&twig_print, XMLDecl => \&twig_print, Doctype => \&twig_print, CdataStart => \&twig_print, CdataEnd => \&twig_print, Proc => \&twig_print, Comment => \&twig_print, Default => \&twig_print, ); # handlers used when twig_roots, print_outside_roots and keep_encoding are used # and we are outside of the roots my %twig_handlers_roots_print_original_2_27= ( Start => \&twig_start_check_roots_print_original, End => \&twig_end_check_roots_print_original, Char => \&twig_print_original, # for some reason original_string is wrong here # this can be a problem if the doctype includes non ascii characters XMLDecl => \&twig_print, Doctype => \&twig_print, # I have no idea why I should not be using this handler! Entity => \&twig_print, #Element => undef, Attlist => undef, CdataStart => \&twig_print_original, CdataEnd => \&twig_print_original, Proc => \&twig_print_original, Comment => \&twig_print_original, Default => \&twig_print_default, # twig_print_original does not work ); my %twig_handlers_roots_print= $parser_version > 2.27 ? %twig_handlers_roots_print_2_30 : %twig_handlers_roots_print_2_27; my %twig_handlers_roots_print_original= $parser_version > 2.27 ? %twig_handlers_roots_print_original_2_30 : %twig_handlers_roots_print_original_2_27; # handlers used when the finish_print method has been called my %twig_handlers_finish_print= ( Start => \&twig_print, End => \&twig_print_end, Char => \&twig_print, Entity => \&twig_print, XMLDecl => \&twig_print, Doctype => \&twig_print, Element => \&twig_print, Attlist => \&twig_print, CdataStart => \&twig_print, CdataEnd => \&twig_print, Proc => \&twig_print, Comment => \&twig_print, Default => \&twig_print, ); # handlers used when the finish_print method has been called and the keep_encoding # option is used my %twig_handlers_finish_print_original= ( Start => \&twig_print_original, End => \&twig_print_end_original, Char => \&twig_print_original, Entity => \&twig_print_original, XMLDecl => \&twig_print_original, Doctype => \&twig_print_original, Element => \&twig_print_original, Attlist => \&twig_print_original, CdataStart => \&twig_print_original, CdataEnd => \&twig_print_original, Proc => \&twig_print_original, Comment => \&twig_print_original, Default => \&twig_print_original, ); # handlers used whithin ignored elements my %twig_handlers_ignore= ( Start => \&twig_ignore_start, End => \&twig_ignore_end, Char => undef, Entity => undef, XMLDecl => undef, Doctype => undef, Element => undef, Attlist => undef, CdataStart => undef, CdataEnd => undef, Proc => undef, Comment => undef, Default => undef, ); # those handlers are only used if the entities are NOT to be expanded my %twig_noexpand_handlers= ( Default => \&twig_default ); my @saved_default_handler; my $ID= 'id'; # default value, set by the Id argument # all allowed options %valid_option= ( # XML::Twig options TwigHandlers => 1, Id => 1, TwigRoots => 1, TwigPrintOutsideRoots => 1, StartTagHandlers => 1, EndTagHandlers => 1, IgnoreElts => 1, CharHandler => 1, KeepEncoding => 1, ParseStartTag => 1, LoadDTD => 1, DTDHandler => 1, DoNotOutputDTD => 1, NoProlog => 1, ExpandExternalEnts => 1, DiscardSpaces => 1, KeepSpaces => 1, DiscardSpacesIn => 1, KeepSpacesIn => 1, PrettyPrint => 1, EmptyTags => 1, Comments => 1, Pi => 1, OutputFilter => 1, InputFilter => 1, OutputEncoding => 1, # XML::Parser options ErrorContext => 1, ProtocolEncoding => 1, Namespaces => 1, NoExpand => 1, Stream_Delimiter => 1, ParseParamEnt => 1, NoLWP => 1, Non_Expat_Options => 1, ); # predefined input and output filters use vars qw( %filter); %filter= ( html => \&html_encode, safe => \&safe_encode, safe_hex => \&safe_encode_hex, ); sub new { my ($class, %args) = @_; my $handlers; # change all nice_perlish_names into nicePerlishNames %args= normalize_args( %args); # check options unless( $args{MoreOptions}) { foreach my $arg (keys %args) { carp "illegal option $arg" unless $valid_option{$arg}; } } # a twig is really an XML::Parser # my $self= XML::Parser->new(%args); my $self; $self= XML::Parser->new(%args); bless $self, $class; if( exists $args{TwigHandlers}) { $handlers= $args{TwigHandlers}; $self->setTwigHandlers( $handlers); delete $args{TwigHandlers}; } # take care of twig-specific arguments if( exists $args{StartTagHandlers}) { $self->setStartTagHandlers( $args{StartTagHandlers}); delete $args{StartTagHandlers}; } if( exists $args{IgnoreElts}) { $self->setIgnoreEltsHandlers( $args{IgnoreElts}); delete $args{IgnoreElts}; } $self->{twig_dtd_handler}= $args{DTDHandler}; delete $args{DTDHandler}; if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; } if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; } if( $args{ExpandExternalEnts}) { $self->set_expand_external_entities( 1); $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts delete $args{LoadDTD}; delete $args{ExpandExternalEnts}; } # deal with TwigRoots argument, a hash of elements for which # subtrees will be built (and associated handlers) if( $args{TwigRoots}) { $self->setTwigRoots( $args{TwigRoots}); delete $args{TwigRoots}; } if( exists $args{EndTagHandlers}) { croak "cannot use EndTagHandlers without TwigRoots" unless ($self->{twig_roots}); $self->setEndTagHandlers( $args{EndTagHandlers}); delete $args{EndTagHandlers}; } if( $args{TwigPrintOutsideRoots}) { croak "cannot use TwigPrintOutsideRoots without TwigRoots" unless( $self->{twig_roots}); # if the arg is a GLOB then it is a file handle, store it if( ref $args{TwigPrintOutsideRoots} eq 'GLOB' ) { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; } $self->{twig_default_print}= $args{TwigPrintOutsideRoots}; } if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); } if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}); } # space policy if( $args{KeepSpaces}) { croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces}); croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); $self->{twig_keep_spaces}=1; delete $args{KeepSpaces}; } if( $args{DiscardSpaces}) { croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); $self->{twig_discard_spaces}=1; delete $args{DiscardSpaces}; } if( $args{KeepSpacesIn}) { croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn}); $self->{twig_discard_spaces}=1; $self->{twig_keep_spaces_in}={}; my @tags= @{$args{KeepSpacesIn}}; foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; } delete $args{KeepSpacesIn}; } if( $args{DiscardSpacesIn}) { $self->{twig_keep_spaces}=1; $self->{twig_discard_spaces_in}={}; my @tags= @{$args{DiscardSpacesIn}}; foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; } delete $args{DiscardSpacesIn}; } # discard spaces by default $self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces}); $args{Comments}||= $COMMENTS_DEFAULT; if( $args{Comments} eq 'drop') { delete $twig_handlers{Comment}; } elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; } elsif( $args{Comments} ne 'process') { croak "wrong value for comments argument: $args{Comments}"; } delete $args{Comments}; $args{Pi}||= $PI_DEFAULT; if( $args{Pi} eq 'drop') { delete $twig_handlers{Pi}; } elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; } elsif( $args{Pi} ne 'process') { croak "wrong value for Pi argument: $args{Pi}"; } delete $args{Pi}; if( $args{KeepEncoding}) { $self->{twig_keep_encoding}= $args{KeepEncoding}; # set it in XML::Twig::Elt so print functions know what to do $self->set_keep_encoding( 1); $self->{parse_start_tag}= $args{ParseStartTag} || \&parse_start_tag; delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ; delete $args{KeepEncoding}; $self->{NoExpand}= 1; } else { $self->set_keep_encoding( 0); $self->{parse_start_tag}= $args{ParseStartTag} if( $args{ParseStartTag}); } if( $args{OutputFilter}) { $self->set_output_filter( $args{OutputFilter}); delete $args{OutputFilter}; } else { $self->set_output_filter( 0); } if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; } if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; } if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; } if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; } if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; } if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; } # set handlers if( $self->{twig_roots}) { if( $self->{twig_default_print}) { if( $self->{twig_keep_encoding}) { $self->setHandlers( %twig_handlers_roots_print_original); } else { $self->setHandlers( %twig_handlers_roots_print); } } else { $self->setHandlers( %twig_handlers_roots); } } else { $self->setHandlers( %twig_handlers); } # XML::Parser::Expat does not like these handler to be set. So in order to # use the various sets of handlers on XML::Parser or XML::Parser::Expat # objects when needed, these ones have to be set only once, here, at # XML::Parser level $self->setHandlers( Init => \&twig_init, Final => \&twig_final); $self->{twig_entity_list}= XML::Twig::Entity_list->new; $self->{twig_id}= $ID; $self->{twig_stored_spaces}=''; $self->{twig}= $self; weaken( $self->{twig}) if( $weakrefs); return $self; } sub parseurl { my $t= shift; return $t->_parseurl( 0, @_); } sub safe_parseurl { my $t= shift; return $t->_parseurl( 1, @_); } # I should really add extra options to allow better configuration of the # LWP::UserAgent object # this method forks: # - the child gets the data and copies it to the pipe, # - the parent reads the stream and sends it to XML::Parser # the data is cut it chunks the size of the XML::Parser::Expat buffer # the method returns the twig and the status sub _parseurl { my( $t, $safe, $url, $agent)= @_; pipe( README, WRITEME) or croak "cannot create connected pipes: $!"; if( my $pid= fork) { # parent code: parse the incoming file close WRITEME; # no need to write my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README); close README; return $@ ? 0 : $t; } else { # child close README; # no need to read require LWP; # so we can get LWP::UserAgent and HTTP::Request $|=1; $agent ||= LWP::UserAgent->new; my $request = HTTP::Request->new( GET => $url); # pass_url_content is called with chunks of data the same size as # the XML::Parser buffer my $response = $agent->request( $request, sub { pass_url_content( \*WRITEME, @_); }, BUFSIZE); $response->is_success or croak "$url ", $response->message; close WRITEME; CORE::exit(); # CORE is there for mod_perl (which redefines exit) } } # get the (hopefully!) XML data from the URL and sub pass_url_content { my( $fh, $data, $response, $protocol)= @_; print $fh $data; } sub add_options { my %args= map { $_, 1 } @_; %args= normalize_args( %args); foreach (keys %args) { $valid_option{$_}++; } } sub twig_store_internal_dtd { twig_log( twig_store_internal_dtd => @_) if( DEBUG); my( $p, $string)= @_; my $t= $p->{twig}; $string= $p->original_string() if( $t->{twig_keep_encoding}); # print STDERR "internal: $string\n"; $t->{twig_doctype}->{internal} .= $string; } sub twig_stop_storing_internal_dtd { twig_log( twig_stop_storing_internal_dtd => @_) if( DEBUG); my $p= shift; # print STDERR "\ntwig_stop_storing_internal_dtd called\n"; if( @saved_default_handler && defined $saved_default_handler[1]) { #print STDERR "restoring saved handlers\n"; $p->setHandlers( @saved_default_handler); } else { #print STDERR "resetting Default handler\n"; my $t= $p->{twig}; $p->setHandlers( Default => undef); } } sub normalize_args { my %normalized_args; while( my $key= shift ) { $key= join '', map { ucfirst } split /_/, $key; #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig'); $normalized_args{$key}= shift ; } return %normalized_args; } sub set_handler { my( $handlers, $path, $handler)= @_; $handlers ||= {}; # create the handlers struct if necessary my $prev_handler= $handlers->{handlers}->{$path} || undef; set_gi_handler ( $handlers, $path, $handler, $prev_handler) || set_path_handler ( $handlers, $path, $handler, $prev_handler) || set_subpath_handler ( $handlers, $path, $handler, $prev_handler) || set_special_handler ( $handlers, $path, $handler, $prev_handler) || set_attribute_handler ( $handlers, $path, $handler, $prev_handler) || set_star_att_handler ( $handlers, $path, $handler, $prev_handler) || set_star_att_regexp_handler ( $handlers, $path, $handler, $prev_handler) || set_string_handler ( $handlers, $path, $handler, $prev_handler) || set_attribute_regexp_handler( $handlers, $path, $handler, $prev_handler) || set_string_regexp_handler ( $handlers, $path, $handler, $prev_handler) || set_pi_handler ( $handlers, $path, $handler, $prev_handler) || croak "unrecognized expression in handler: $path"; # this both takes care of the simple (gi) handlers and store # the handler code reference for other handlers $handlers->{handlers}->{$path}= $handler; return $prev_handler; } sub set_gi_handler { my( $handlers, $path, $handler, $prev_handler)= @_; if( $path =~ m{^\s*($REG_NAME)\s*$}o ) { my $gi= $1; # print STDERR "gi handler found: $gi\n"; $handlers->{handlers}->{gi}->{$gi}= $handler; return 1; } else { return 0; } } sub set_special_handler { my( $handlers, $path, $handler, $prev_handler)= @_; if( $path =~ m{^\s*($ALL|$DEFAULT)\s*$}o ) { $handlers->{handlers}->{$1}= $handler; return 1; } else { return 0; } } sub set_path_handler { my( $handlers, $path, $handler, $prev_handler)= @_; if( $path=~ m{^\s*(?:/$REG_NAME)*/($REG_NAME)\s*$}o) { # a full path has been defined # update the path_handlers count, knowing that # either the previous or the new handler can be undef $handlers->{path_handlers}->{gi}->{$1}-- if( $prev_handler); if( $handler) { $handlers->{path_handlers}->{gi}->{$1}++; $handlers->{path_handlers}->{path}->{$path}= $handler; } return 1; } else { return 0; } } sub set_subpath_handler { my( $handlers, $path, $handler, $prev_handler)= @_; if( $path=~ m{^\s*(?:$REG_NAME/)+($REG_NAME)\s*$}o) { # a partial path has been defined # $1 is the "final" gi $handlers->{subpath_handlers}->{gi}->{$1}-- if( $prev_handler); if( $handler) { $handlers->{subpath_handlers}->{gi}->{$1}++; $handlers->{subpath_handlers}->{path}->{$path}= $handler; } return 1; } else { return 0; } } sub set_attribute_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for attribute conditions if( $path=~ m{^\s*($REG_NAME) # elt \s*\[\s*\@ # [@ ($REG_NAME)\s* # att (?:=\s*($REG_STRING)\s*)? # = value (optional) \]\s*$}xo) # ] { my( $gi, $att, $val)= ($1, $2, $3); $val= substr( $val, 1, -1) if( defined $val); # remove the quotes if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{attcond_handlers_exp}->{$gi}}) { if( ($exp->{att} eq $att) && ($exp->{val} eq $val) ) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { $handlers->{attcond_handlers}->{$gi}--; splice( @{$handlers->{attcond_handlers_exp}->{$gi}}, $i, 1); last; } } $i++; } } elsif( $handler) { # new handler only $handlers->{attcond_handlers}->{$gi}++; my $exp={att => $att, val => $val, handler => $handler}; $handlers->{attcond_handlers_exp}->{$gi} ||= []; push @{$handlers->{attcond_handlers_exp}->{$gi}}, $exp; } return 1; } else { return 0; } } sub set_attribute_regexp_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for attribute regexp conditions if( $path=~ m{^\s*($REG_NAME) # elt \s*\[\s*\@ # [@ ($REG_NAME) # att \s*=~\s* # =~ /($REG_REGEXP_EXP)/ # /regexp/ ($REG_REGEXP_MOD) # mods \s*]\s*$}gxo) # ] { my( $gi, $att, $regexp, $mods)= ($1, $2, $3, $4); $regexp= qr/(?$mods:$regexp)/; # print STDERR "\ngi: $gi - att: $att - regexp: $regexp\n"; if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{attregexp_handlers_exp}->{$gi}}) { if( ($exp->{att} eq $att) && ($exp->{regexp} eq $regexp) ) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { $handlers->{attregexp_handlers}->{$gi}--; splice( @{$handlers->{attregexp_handlers_exp}->{$gi}}, $i, 1); last; } } $i++; } } elsif( $handler) { # new handler only $handlers->{attregexp_handlers}->{$gi}++; my $exp={att => $att, regexp => $regexp, handler => $handler}; $handlers->{attregexp_handlers_exp}->{$gi} ||= []; push @{$handlers->{attregexp_handlers_exp}->{$gi}}, $exp; } return 1; } else { return 0; } } sub set_string_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for string conditions if( $path=~/^\s*($REG_NAME) # elt \s*\[\s*string # [string \s*\(\s*($REG_NAME)?\s*\) # (sub_elt) \s*=\s* # = ($REG_STRING) # "text" (or 'text') \s*\]\s*$/ox) # ] { my( $gi, $sub_elt, $text)= ($1, $2, $3); $text= substr( $text, 1, -1) if( defined $text); # remove the quotes if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{text_handlers_exp}->{$gi}}) { if( ($exp->{text} eq $text) && ( !$exp->{sub_elt} || ($exp->{sub_elt} eq $sub_elt) ) ) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { $handlers->{text_handlers}->{$gi}--; splice( @{$handlers->{text_handlers_exp}->{$gi}}, $i, 1); last; } } $i++; } } elsif( $handler) { # new handler only $handlers->{text_handlers}->{$gi}++; my $exp={sub_elt => $sub_elt, text => $text, handler => $handler}; $handlers->{text_handlers_exp}->{$gi} ||= []; push @{$handlers->{text_handlers_exp}->{$gi}}, $exp; } return 1; } else { return 0; } } sub set_string_regexp_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for string regexp conditions if( $path=~m{^\s*($REG_NAME) # (elt) \s*\[\s*string # [string \s*\(\s*($REG_NAME?)\) # (sub_elt) \s*=~\s* # =~ /($REG_REGEXP_EXP)/ # /(regexp)/ \s*($REG_REGEXP_MOD)? # (mods) \s*\]\s*$}ox) # ] (or ') { my( $gi, $sub_elt, $regexp, $mods)= ($1, $2, $3, $4); $mods||=""; $regexp= qr/(?$mods:$regexp)/; if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{regexp_handlers_exp}->{$gi}}) { if( ($exp->{regexp} eq $regexp) && ( !$exp->{sub_elt} || ($exp->{sub_elt} eq $sub_elt) ) ) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { $handlers->{regexp_handlers}->{$gi}--; splice( @{$handlers->{regexp_handlers_exp}->{$gi}}, $i, 1); last; } } $i++; } } elsif( $handler) { # new handler only $handlers->{regexp_handlers}->{$gi}++; my $exp= {sub_elt => $sub_elt, regexp => $regexp, handler => $handler}; $handlers->{regexp_handlers_exp}->{$gi} ||= []; push @{$handlers->{regexp_handlers_exp}->{$gi}}, $exp; } return 1; } else { return 0; } } sub set_star_att_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for *[@att="val"] or *[@att] conditions if( $path=~/^(?:\s*\*)? # * (optionnal) \s*\[\s*\@ # [@ ($REG_NAME) # att (?:\s*=\s* # = ($REG_STRING))? # string \s*\]\s*$/ox) # ] { my( $att, $val)= ($1, $2); $val= substr( $val, 1, -1) if( defined $val); # remove the quotes from the string # print STDERR "star att handler: $path -> $att - ",$val || '', "\n"; if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{att_handlers_exp}->{$att}}) { if( ($exp->{att} eq $att) && ( !defined( $val) || ($exp->{val} eq $val) ) ) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { splice( @{$handlers->{att_handlers_exp}->{$att}}, $i, 1); $handlers->{att_handlers}->{$att}--; last; } } $i++; } } elsif( $handler) { # new handler only $handlers->{att_handlers}->{$att}++; my $exp={att => $att, val => $val, handler => $handler}; $handlers->{att_handlers_exp}->{$att} ||= []; push @{$handlers->{att_handlers_exp}->{$att}}, $exp; } return 1; } else { return 0; } } sub set_star_att_regexp_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # check for *[@att=~ /regexp/] conditions if( $path=~ m{^(?:\s*\*)? # * (optionnal) \s*\[\s*\@ # [@ ($REG_NAME) # att \s*=~\s* # =~ /($REG_REGEXP_EXP)/ # /(regexp)/ \s*($REG_REGEXP_MOD)? # (mods) \s*\]\s*$}ox) # ] { my( $att, $regexp, $mods)= ($1, $2, $3); $mods||=""; $regexp= qr/(?$mods:$regexp)/; # print STDERR "star att handler: $path -> $att - ",$val || '', "\n"; if( $prev_handler) { # replace or remove the previous handler my $i=0; # so we can splice the array if need be foreach my $exp ( @{$handlers->{att_regexp_handlers_exp}->{$att}}) { if( $exp->{regexp} eq $regexp) { if( $handler) # just replace the handler { $exp->{handler}= $handler; } else # remove the handler { splice( @{$handlers->{att_regexp_handlers_exp}->{$att}}, $i, 1); $handlers->{att_regexp_handlers}--; last; } } $i++; } } elsif( $handler) { # new handler only my $exp= { regexp => $regexp, handler => $handler}; $handlers->{regexp_handlers_exp}->{$att} ||= []; push @{$handlers->{att_regexp_handlers_exp}->{$att}}, $exp; $handlers->{att_regexp_handlers}++; } return 1; } else { return 0; } } sub set_pi_handler { my( $handlers, $path, $handler, $prev_handler)= @_; # PI conditions ( '?target' => \&handler or '?' => \&handler # or '#PItarget' => \&handler or '#PI' => \&handler) if( $path=~ /^\s*(?:\?|#PI)\s*([^\s]*\s*)$/) { my $target= $1 || ''; # update the path_handlers count, knowing that # either the previous or the new handler can be undef $handlers->{pi_handlers}->{$1}= $handler; return 1; } else { return 0; } } sub setCharHandler { my( $t, $handler)= @_; $t->{twig_char_handler}= $handler; } sub reset_handlers { my $handlers= shift; delete $handlers->{handlers}; delete $handlers->{path_handlers}; delete $handlers->{subpath_handlers}; $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers}); delete $handlers->{attcond_handlers}; } sub set_handlers { my $handlers= shift || return; my $set_handlers= {}; foreach my $path (keys %{$handlers}) { set_handler( $set_handlers, $path, $handlers->{$path}); } return $set_handlers; } sub setTwigHandler { my( $t, $path, $handler)= @_; $t->{twig_handlers} ||={}; return set_handler( $t->{twig_handlers}, $path, $handler); } sub setTwigHandlers { my( $t, $handlers)= @_; my $previous_handlers= $t->{twig_handlers} || undef; reset_handlers( $t->{twig_handlers}); $t->{twig_handlers}= set_handlers( $handlers); return $previous_handlers; } sub setStartTagHandler { my( $t, $path, $handler)= @_; return set_handler( $t->{twig_starttag_handlers}, $path,$handler); } sub setStartTagHandlers { my( $t, $handlers)= @_; my $previous_handlers= $t->{twig_starttag_handlers} || undef; reset_handlers( $t->{twig_starttag_handlers}); $t->{twig_starttag_handlers}= set_handlers( $handlers); return $previous_handlers; } sub setIgnoreEltsHandler { my( $t, $path, $action)= @_; return set_handler( $t->{twig_ignore_elts_handlers}, $path, $action ); } sub setIgnoreEltsHandlers { my( $t, $handlers)= @_; my $previous_handlers= $t->{twig_ignore_elts_handlers} || undef; reset_handlers( $t->{twig_ignore_elts_handlers}); $t->{twig_ignore_elts_handlers}= set_handlers( $handlers); return $previous_handlers; } sub setEndTagHandler { my( $t, $path, $handler)= @_; return set_handler( $t->{twig_endtag_handlers}, $path,$handler); } sub setEndTagHandlers { my( $t, $handlers)= @_; my $previous_handlers= $t->{twig_endtag_handlers} || undef; reset_handlers( $t->{twig_endtag_handlers}); $t->{twig_endtag_handlers}= set_handlers( $handlers); return $previous_handlers; } # a little more complex: set the twig_handlers only if a code ref is given sub setTwigRoots { my( $t, $handlers)= @_; my $previous_roots= $t->{twig_roots} || undef; reset_handlers($t->{twig_roots}); $t->{twig_roots}= set_handlers( $handlers); foreach my $path (keys %{$handlers}) { $t->{twig_handlers}||= {}; set_handler( $t->{twig_handlers}, $path, $handlers->{$path}) if( UNIVERSAL::isa( $handlers->{$path}, 'CODE')); } return $previous_roots; } # just store the reference to the expat object in the twig sub twig_init { twig_log( twig_init => @_) if( DEBUG); my $p= shift; my $t=$p->{twig}; $t->{twig_parser}= $p; weaken( $t->{twig_parser}) if( $weakrefs); $t->{twig_parsing}=1; # if the prolog is not needed then unset handlers # this is not robust AT ALL and needs to be fixed if( $t->{no_prolog}) { $p->setHandlers( XMLDecl => undef, Doctype => undef, Default => undef); } # in case they had been created by a previous parse delete $t->{twig_dtd}; delete $t->{twig_doctype}; delete $t->{twig_xmldecl}; # if needed set the output filehandle $t->set_fh_to_twig_output_fh(); } # uses eval to catch the parser's death sub safe_parse { my( $t, $str)= @_; eval { $t->parse( $str); } ; return $@ ? 0 : $t; } sub safe_parsefile { my( $t, $file)= @_; eval { $t->parsefile( $file); } ; return $@ ? 0 : $t; } sub add_or_discard_stored_spaces { my $t= shift; my %option= @_; if( $t->{twig_stored_spaces} || $option{force}) { if( $t->{twig_current}->is_pcdata) { $t->{twig_current}->append_pcdata($t->{twig_stored_spaces}); } else { my $current_gi= $t->{twig_current}->gi; #warn "in add_or_discard_stored_spaces, current_gi: $current_gi\n"; $t->{twig_space_policy}->{$current_gi}= space_policy( $t, $current_gi) unless defined( $t->{twig_space_policy}->{$current_gi}); if( $t->{twig_space_policy}->{$current_gi} || $option{force}) { insert_pcdata( $t, $t->{twig_stored_spaces} ); } $t->{twig_stored_spaces}=''; } } } # the default twig handlers, which build the tree sub twig_start($$%) { twig_log( twig_start => @_) if( DEBUG); my ($p, $gi, %att) = @_; my $t=$p->{twig}; # print STDERR "[start tag " . $p->original_string() ."]"; # empty the stored pcdata (space stored in case they are really part of # a pcdata element) or stored it if the space policy dictades so # create a pcdata element with the spaces if need be add_or_discard_stored_spaces( $t); my $parent= $t->{twig_current}; # if we were parsing PCDATA then we exit the pcdata if( $t->{twig_in_pcdata}) { $t->{twig_in_pcdata}= 0; delete $parent->{'twig_current'}; $parent= $parent->{parent}; } # if we choose to keep the encoding then we need to parse the tag if( my $func = $t->{parse_start_tag}) { ($gi, %att)= &$func($p->original_string); } # filter the input data if need be if( my $filter= $t->{twig_input_filter}) { $gi= $filter->( $gi); %att= map { $filter->($_), $filter->($att{$_})} keys %att; } my $elt= XML::Twig::Elt->new( $gi); $elt->{'att'}= \%att; delete $parent->{'twig_current'} if( $parent); $t->{twig_current}= $elt; $elt->{'twig_current'}=1; if( $parent) { my $prev_sibling= $parent->{last_child}; if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; $elt->set_prev_sibling( $prev_sibling); } $elt->set_parent( $parent); $parent->{first_child}= $elt unless( $parent->{first_child}); $parent->set_last_child( $elt); } else { # processing root $t->set_root( $elt); # call dtd handlerif need be $t->{twig_dtd_handler}->($t, $t->{twig_dtd}) if( defined $t->{twig_dtd_handler}); # set this so we can catch external entities # (the handler was modified during DTD processing) if( $t->{twig_default_print}) { $p->setHandlers( Default => \&twig_print); } elsif( $t->{twig_roots}) { $p->setHandlers( Default => sub { return }); } else { $p->setHandlers( Default => \&twig_default); } } if( $p->recognized_string=~ m{/>$}s) { $elt->{empty}=1; } $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data}); $t->{extra_data}=''; # if the element is ID-ed then store that info my $id= $elt->{'att'}->{$ID}; if( $id) { $t->{twig_id_list}->{$id}= $elt; } # call user handler if need be if( $t->{twig_starttag_handlers}) { # call all appropriate handlers my @handlers= handler( $t, $t->{twig_starttag_handlers}, $gi, $elt); local $_= $elt; foreach my $handler ( @handlers) { $handler->($t, $elt) || last; } # call _all_ handler if needed if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL}) { $all->($t, $elt); } } # check if the tag is in the list of tags to be ignored if( $t->{twig_ignore_elts_handlers}) { my @handlers= handler( $t, $t->{twig_ignore_elts_handlers}, $gi, $elt); # only the first handler counts, it contains the action (discard/print/string) if( @handlers) { my $action= shift @handlers; $t->ignore( $action); } } } # the default function to parse a start tag (in keep_encoding mode) # can be overridden with the parse_start_tag (or parse_start_tag) method # only works for 1-byte character sets sub parse_start_tag { my $string= shift; my( $gi, %atts); # get the gi (between < and the first space, / or > character) if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s) { $gi= $1; } else { croak "internal error when parsing start tag $string"; } while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s) { $atts{$1}= $3; } return $gi, %atts; } sub set_root { my( $t, $elt)= @_; $t->{twig_root}= $elt; $elt->{twig}= $t; weaken( $elt->{twig}) if( $weakrefs); } sub twig_end($$;@) { twig_log( twig_end => @_) if( DEBUG); my ($p, $gi) = @_; my $t=$p->{twig}; if( $t->{twig_stored_spaces}) { $t->{twig_space_policy}->{$gi}= space_policy( $t, $gi) unless defined( $t->{twig_space_policy}->{$gi}); if( $t->{twig_space_policy}->{$gi}) { insert_pcdata( $t, $t->{twig_stored_spaces}) }; $t->{twig_stored_spaces}=''; } # the new twig_current is the parent my $elt= $t->{twig_current}; delete $elt->{'twig_current'}; # if we were parsing PCDATA then we exit the pcdata too if( $t->{twig_in_pcdata}) { $t->{twig_in_pcdata}= 0; $elt= $elt->{parent} if($elt->{parent}); delete $elt->{'twig_current'}; } # parent is the new current element my $parent= $elt->{parent}; $parent->{'twig_current'}=1 if( $parent); $t->{twig_current}= $parent; $elt->{extra_data_before_end_tag}= $t->{extra_data} if( $t->{extra_data}); $t->{extra_data}=''; if( $t->{twig_handlers}) { # look for handlers my @handlers= handler( $t, $t->{twig_handlers}, $gi, $elt); local $_= $elt; # so we can use $_ in the handlers foreach my $handler ( @handlers) { $handler->($t, $elt) || last; } # call _all_ handler if needed if( my $all= $t->{twig_handlers}->{handlers}->{$ALL}) { $all->($t, $elt); } } # if twig_roots is set for the element then set appropriate handler if( handler( $t, $t->{twig_roots}, $gi, $elt)) { if( $t->{twig_default_print}) { # select the proper fh (and store the currently selected one) $t->set_fh_to_twig_output_fh(); if( $t->{twig_keep_encoding}) { $p->setHandlers( %twig_handlers_roots_print_original); } else { $p->setHandlers( %twig_handlers_roots_print); } } else { $p->setHandlers( %twig_handlers_roots); } } } # return the list of handler that can be activated for an element # (either of CODE ref's or 1's for twig_roots) sub handler { my( $t, $handlers, $gi, $elt)= @_; my @found_handlers=(); my $found_handler; # warning: $elt can be either # - a regular element # - a ref to the attribute hash (when called for an element # for which the XML::Twig::Elt has not been built, outside # of the twig_roots) # - a string (case of an entity in keep_encoding mode) # check for an attribute expression with no gi if( $handlers->{att_handlers}) { my %att_handlers= %{$handlers->{att_handlers_exp}}; foreach my $att ( keys %att_handlers) { my $att_val; # get the attribute value if( ref $elt eq 'HASH') { $att_val= $elt->{$att}; } # $elt is the atts hash elsif( UNIVERSAL::isa( $elt,'XML::Twig::Elt')) { $att_val= $elt->{'att'}->{$att}; } # $elt is an element if( defined $att_val) { my @cond= @{$handlers->{att_handlers_exp}->{$att}}; foreach my $cond (@cond) { # 2 cases: either there is a val and the att value should be equal to it # or there is no val (condition was gi[@att]), just for the att to be defined if( !defined $cond->{val} || ($att_val eq $cond->{val}) ) { push @found_handlers, $cond->{handler};} } } } } # check for an attribute regexp expression with no gi if( $handlers->{att_regexp_handlers}) { my %att_handlers= %{$handlers->{att_regexp_handlers_exp}}; foreach my $att ( keys %att_handlers) { my $att_val; # get the attribute value if( ref $elt eq 'HASH') { $att_val= $elt->{$att}; } # $elt is the atts hash elsif( UNIVERSAL::isa( $elt,'XML::Twig::Elt')) { $att_val= $elt->{'att'}->{$att}; } # $elt is an element if( defined $att_val) { my @cond= @{$handlers->{att_regexp_handlers_exp}->{$att}}; foreach my $cond (@cond) { if( $att_val=~ $cond->{regexp}) { push @found_handlers, $cond->{handler};} } } } } # check for a text expression if( $handlers->{text_handlers}->{$gi}) { my @text_handlers= @{$handlers->{text_handlers_exp}->{$gi}}; foreach my $exp ( @text_handlers) { if (!$exp->{sub_elt}) { push @found_handlers, $exp->{handler} if $elt->text eq $exp->{text}; } else { foreach my $child ($elt->children($exp->{sub_elt})) { if( $child->text eq $exp->{text}) { push @found_handlers, $exp->{handler}; last; } } } } } # check for a text regexp expression if( $handlers->{regexp_handlers}->{$gi}) { my @regexp_handlers= @{$handlers->{regexp_handlers_exp}->{$gi}}; foreach my $exp ( @regexp_handlers) { if( !$exp->{sub_elt}) { push @found_handlers, $exp->{handler} if $elt->text =~ $exp->{regexp}; } else { foreach my $child ($elt->children($exp->{sub_elt})) { if( $child->text =~ $exp->{regexp}) { push @found_handlers, $exp->{handler}; last; } } } } } # check for an attribute expression if( $handlers->{attcond_handlers}->{$gi}) { my @attcond_handlers= @{$handlers->{attcond_handlers_exp}->{$gi}}; foreach my $exp ( @attcond_handlers) { my $att_val; # get the attribute value if( ref $elt eq 'HASH') { $att_val= $elt->{$exp->{att}}; } # $elt is the atts hash else { $att_val= $elt->{'att'}->{$exp->{att}}; }# $elt is an element # 2 cases: either there is a val and the att value should be equal to it # or there is no val (condition was gi[@att]), just for the att to be defined if( defined $att_val && ( !defined $exp->{val} || ($att_val eq $exp->{val}) ) ) { push @found_handlers, $exp->{handler}; } } } # check for an attribute regexp if( $handlers->{attregexp_handlers}->{$gi}) { my @attregexp_handlers= @{$handlers->{attregexp_handlers_exp}->{$gi}}; foreach my $exp ( @attregexp_handlers) { my $att_val; # get the attribute value if( ref $elt eq 'HASH') { $att_val= $elt->{$exp->{att}}; } # $elt is the atts hash else { $att_val= $elt->{'att'}->{$exp->{att}}; }# $elt is an element if( defined $att_val && ( ($att_val=~ $exp->{regexp}) ) ) { push @found_handlers, $exp->{handler}; } } } # check for a full path if( defined $handlers->{path_handlers}->{gi}->{$gi}) { my $path= $t->path( $gi); if( defined( $found_handler= $handlers->{path_handlers}->{path}->{$path}) ) { push @found_handlers, $found_handler; } } # check for a partial path if( $handlers->{subpath_handlers}->{gi}->{$gi}) { my $path= $t->path( $gi); while( $path) { # test each sub path if( defined( $found_handler= $handlers->{subpath_handlers}->{path}->{$path}) ) { push @found_handlers, $found_handler; } $path=~ s{^[^/]*/?}{}; # remove initial gi and / } } # check for a gi (simple gi's are stored directly in the handlers field) if( defined $handlers->{handlers}->{gi}->{$gi}) { push @found_handlers, $handlers->{handlers}->{gi}->{$gi}; } # if no handler found call default handler if defined if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT}) { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; } return @found_handlers; # empty if no handler found } sub twig_char { twig_log( twig_char => @_) if( DEBUG); my ($p, $string)= @_; # print STDERR "[char: $string (" . $p->original_string(). ")]"; my $t=$p->{twig}; # if keep_encoding was set then use the original string instead of # the parsed (UTF-8 converted) one if( $t->{twig_keep_encoding}) { $string= $p->original_string(); } if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); } if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); } my $elt= $t->{twig_current}; if( $t->{twig_in_cdata}) { # text is the continuation of a previously created pcdata $elt->{cdata}.= $t->{twig_stored_spaces}.$string; } elsif( $t->{twig_in_pcdata}) { # text is the continuation of a previously created cdata $elt->{pcdata}.= $string; } else { # text is just space, which might be discarded later if( $string=~/\A\s*\Z/s) { if( $t->{extra_data}) { # we got extra data (comment, pi), lets add the spaces to it $t->{extra_data} .= $string; } else { # no extra data, just store the spaces $t->{twig_stored_spaces}.= $string; } } else { my $new_elt= insert_pcdata( $t, $t->{twig_stored_spaces}.$string); delete $elt->{'twig_current'}; $new_elt->{'twig_current'}=1; $t->{twig_current}= $new_elt; $t->{twig_in_pcdata}=1; $new_elt->{extra_data}= $t->{extra_data} if( $t->{extra_data}); $t->{extra_data}=''; } } } sub twig_cdatastart { twig_log( twig_cdatastart => @_) if( DEBUG); my $p= shift; my $t=$p->{twig}; $t->{twig_in_cdata}=1; my $cdata= XML::Twig::Elt->new( '#CDATA'); my $twig_current= $t->{twig_current}; if( $t->{twig_in_pcdata}) { # create the node as a sibling of the #PCDATA $cdata->set_prev_sibling( $twig_current); $twig_current->{next_sibling}= $cdata; my $parent= $twig_current->{parent}; $cdata->set_parent( $parent); $parent->set_last_child( $cdata); $t->{twig_in_pcdata}=0; } else { # we have to create a PCDATA element if we need to store spaces if( $t->space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) { insert_pcdata( $t, $t->{twig_stored_spaces}); } $t->{twig_stored_spaces}=''; # create the node as a child of the current element $cdata->set_parent( $twig_current); $twig_current->set_last_child( $cdata); if( my $prev_sibling= $twig_current->{first_child}) { $cdata->set_prev_sibling( $prev_sibling); $prev_sibling->{next_sibling}= $cdata; } else { $twig_current->{first_child}= $cdata; } } delete $twig_current->{'twig_current'}; $t->{twig_current}= $cdata; $cdata->{'twig_current'}=1; } sub twig_cdataend { twig_log( twig_cdataend => @_) if( DEBUG); my $p= shift; my $t=$p->{twig}; $t->{twig_in_cdata}=0; my $elt= $t->{twig_current}; delete $elt->{'twig_current'}; my $cdata= $elt->{cdata}; $elt->{cdata}= $cdata; if( $t->{twig_handlers}) { # look for handlers my @handlers= handler( $t, $t->{twig_handlers}, CDATA, $elt); local $_= $elt; # so we can use $_ in the handlers foreach my $handler ( @handlers) { $handler->($t, $elt) || last; } } $elt= $elt->{parent}; $t->{twig_current}= $elt; $elt->{'twig_current'}=1; } sub twig_pi { twig_log( twig_pi => @_) if( DEBUG); my( $p, $target, $data)= @_; my $t=$p->{twig}; if( $t->{twig_input_filter}) { $target = $t->{twig_input_filter}->( $target) ; $data = $t->{twig_input_filter}->( $data) ; } my $twig_current= $t->{twig_current}; # always defined # if pi's are to be kept then we piggiback them to the current element if( $t->{twig_keep_pi}) { if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target}) { $t->{extra_data}.= $handler->( $t, $target, $data); } elsif( $handler= $t->{twig_handlers}->{pi_handlers}->{''}) { $t->{extra_data}.= $handler->( $t, $target, $data); } else { if( $t->{twig_stored_spaces}) { $t->{extra_data}.= $t->{twig_stored_spaces}; $t->{twig_stored_spaces}= ''; } $t->{extra_data}.= $p->recognized_string(); } } else { my $pi= XML::Twig::Elt->new( PI); $pi->set_pi( $target, $data); unless( $t->root) { pi_handlers( $t, $pi, $target); return add_prolog_data( $t, $pi) } if( $t->{twig_in_pcdata}) { # create the node as a sibling of the #PCDATA $pi->paste_after( $twig_current); $t->{twig_in_pcdata}=0; } else { # we have to create a PCDATA element if we need to store spaces if( $t->space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) { insert_pcdata( $t, $t->{twig_stored_spaces}); } $t->{twig_stored_spaces}=''; # create the node as a child of the current element $pi->paste_last_child( $twig_current); } delete $twig_current->{'twig_current'}; my $parent= $pi->{parent}; $t->{twig_current}= $parent; $parent->{'twig_current'}=1; pi_handlers( $t, $pi, $target); } } sub pi_handlers { my( $t, $pi, $target)= @_; if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target}) { local $_= $pi; $handler->( $t, $pi); } elsif( $handler= $t->{twig_handlers}->{pi_handlers}->{''}) { local $_= $pi; $handler->( $t, $pi); } } sub twig_comment { twig_log( twig_comment => @_) if( DEBUG); my( $p, $data)= @_; my $t=$p->{twig}; $data= $t->{twig_input_filter}->( $data) if( $t->{twig_input_filter}); my $twig_current= $t->{twig_current}; # always defined # if comments are to be kept then we piggiback them to the current element if( $t->{twig_keep_comments}) { $t->{extra_data}.= $XML::Twig::Elt::keep_encoding ? $p->recognized_string() : $p->original_string(); return; } my $comment= XML::Twig::Elt->new( COMMENT); $comment->{comment}= $data; unless( $t->root) { add_prolog_data( $t, $comment); comment_handler( $t, $comment); return; } if( $t->{twig_in_pcdata}) { # create the node as a sibling of the #PCDATA $comment->paste_after( $twig_current); $t->{twig_in_pcdata}=0; } else { # we have to create a PCDATA element if we need to store spaces if( $t->space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) { insert_pcdata( $t, $t->{twig_stored_spaces}); } $t->{twig_stored_spaces}=''; # create the node as a child of the current element $comment->paste_last_child( $twig_current); } comment_handler( $t, $comment); delete $twig_current->{'twig_current'}; my $parent= $comment->{parent}; $t->{twig_current}= $parent; $parent->{'twig_current'}=1; } sub comment_handler { my( $t, $comment)= @_; if( $t->{twig_handlers}->{handlers}->{gi}->{'#COMMENT'}) { # look for handlers local $_= $comment; my @handlers= handler( $t, $t->{twig_handlers}, '#COMMENT', $comment); foreach my $handler ( @handlers) { $handler->($t, $comment) || last; } } } sub add_prolog_data { my($t, $prolog_data)= @_; # comment before the first element $t->{prolog_data} ||= XML::Twig::Elt->new( '#PROLOG_DATA'); # create the node as a child of the current element $prolog_data->paste_last_child( $t->{prolog_data}); } sub twig_final { twig_log( twig_final => @_) if( DEBUG); my $p= shift; my $t=$p->{twig}; # restore the selected filehandle if needed $t->set_fh_to_selected_fh(); # tries to clean-up (probably not very well at the moment) undef $p->{twig}; undef $t->{twig_parser}; undef $t->{twig_parsing}; return $t; } sub insert_pcdata { my( $t, $string)= @_; # create a new #PCDATA element my $parent= $t->{twig_current}; # always defined my $elt= XML::Twig::Elt->new( PCDATA); $elt->{pcdata}= $string; my $prev_sibling= $parent->{last_child}; if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; $elt->set_prev_sibling( $prev_sibling); } else { $parent->{first_child}= $elt; } $elt->set_parent( $parent); $parent->set_last_child( $elt); $t->{twig_stored_spaces}=''; return $elt; } sub space_policy { my( $t, $gi)= @_; my $policy; $policy=0 if( $t->{twig_discard_spaces}); $policy=1 if( $t->{twig_keep_spaces}); $policy=1 if( $t->{twig_keep_spaces_in} && $t->{twig_keep_spaces_in}->{$gi}); $policy=0 if( $t->{twig_discard_spaces_in} && $t->{twig_discard_spaces_in}->{$gi}); return $policy; } sub twig_entity($$$$$$) { twig_log( twig_entity => @_) if( DEBUG); my( $p, $name, $val, $sysid, $pubid, $ndata)= @_; my $t=$p->{twig}; my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata); $t->{twig_entity_list}->add( $ent); if( $parser_version > 2.27) { # this is really ugly, but the value of the entity is not properly # returned in the default handler if( $t->{twig_keep_encoding} && defined $ent->{val}) { my $val= $ent->{val}; $t->{twig_doctype}->{internal} .= $val =~ /"/ ? qq{'$val' } : qq{"$val" }; } } } sub twig_xmldecl { twig_log( twig_xmldecl => @_) if( DEBUG); my $p= shift; my $t=$p->{twig}; $t->{twig_xmldecl}||={}; # could have been set by set_output_encoding $t->{twig_xmldecl}->{version}= shift; $t->{twig_xmldecl}->{encoding}= shift; $t->{twig_xmldecl}->{standalone}= shift; } sub twig_doctype { twig_log( twig_doctype => @_) if( DEBUG); my( $p, $name, $sysid, $pub, $internal)= @_; my $t=$p->{twig}; $t->{twig_doctype}||= {}; # create $t->{twig_doctype}->{name}= $name; # always there $t->{twig_doctype}->{sysid}= $sysid; # $t->{twig_doctype}->{pub}= $pub; # # now let's try to cope with XML::Parser 2.28 and above if( $parser_version > 2.27) { @saved_default_handler= $p->setHandlers( Default => \&twig_store_internal_dtd, Entity => \&twig_entity, ); $p->setHandlers( DoctypeFin => \&twig_stop_storing_internal_dtd); $t->{twig_doctype}->{internal}=''; } else # for XML::Parser before 2.28 { $t->{twig_doctype}->{internal}=$internal; } # now check if we want to get the DTD info if( $t->{twig_read_external_dtd} && $sysid) { # let's build a fake document with an internal DTD # is this portable? # print STDERR "loading external DTD\n"; my $tmpfile= "twig_tmp$$"; open( TMP, ">$tmpfile") or croak "cannot create temp file $tmpfile: $!"; print TMP "; close DTD; print TMP $dtd; # add the dtd } print TMP "]>"; # close the dtd print TMP "\n"; # XML::Parser needs an element close TMP; $t->save_global_state(); # save the globals (they will be reset by the following new) my $t_dtd= XML::Twig->new; # create a temp twig $t->restore_global_state(); $t_dtd->parsefile( $tmpfile); # parse it $t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info #$t->{twig_dtd_is_external}=1; $t->{twig_entity_list}= $t_dtd->{twig_entity_list}; # grab the entity info unlink $tmpfile; } } sub twig_element { twig_log( twig_element => @_) if( DEBUG); my( $p, $name, $model)= @_; my $t=$p->{twig}; $t->{twig_dtd}||= {}; # may create the dtd $t->{twig_dtd}->{model}||= {}; # may create the model hash $t->{twig_dtd}->{elt_list}||= []; # ordered list of elements push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt $t->{twig_dtd}->{model}->{$name}= $model; # store the model if( $parser_version > 2.27) { $t->{twig_doctype}->{internal} .= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; } } sub twig_attlist { twig_log( twig_attlist => @_) if( DEBUG); my( $p, $el, $att, $type, $default, $fixed)= @_; my $t=$p->{twig}; $t->{twig_dtd}||= {}; # create dtd if need be $t->{twig_dtd}->{$el}||= {}; # create elt if need be $t->{twig_dtd}->{$el}->{att}||= {}; # create att if need be $t->{twig_dtd}->{att}->{$el}->{$att}= {} ; $t->{twig_dtd}->{att}->{$el}->{$att}->{type}= $type; $t->{twig_dtd}->{att}->{$el}->{$att}->{default}= $default; $t->{twig_dtd}->{att}->{$el}->{$att}->{fixed}= $fixed; if( $parser_version > 2.27) { $t->{twig_doctype}->{internal} .= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; } } sub twig_default { twig_log( twig_default => @_) if( DEBUG); my( $p, $string)= @_; my $t= $p->{twig}; # print STDERR "[default: $string (". $p->original_string(). ")]"; # process only if we have an entity return unless( $string=~ m{^&([^;]*);$}); # print STDERR "entity $string found\n"; # the entity has to be pure pcdata, or we have a problem my $ent; if( $t->{twig_keep_encoding}) { twig_char( $p, $string); $ent= substr( $string, 1, -1); } else { $ent= twig_insert_ent( $t, $string); } } sub twig_insert_ent { twig_log( twig_insert_ent => @_) if( DEBUG); my( $t, $string)=@_; # print STDERR "[set_ent $string]"; my $twig_current= $t->{twig_current}; my $ent= XML::Twig::Elt->new( '#ENT'); $ent->{ent}= $string; add_or_discard_stored_spaces( $t, force => 0); if( $t->{twig_in_pcdata}) { # create the node as a sibling of the #PCDATA $ent->set_prev_sibling( $twig_current); $twig_current->{next_sibling}= $ent; my $parent= $twig_current->{parent}; $ent->set_parent( $parent); $parent->set_last_child( $ent); # the twig_current is now the parent delete $twig_current->{'twig_current'}; $t->{twig_current}= $parent; # we left pcdata $t->{twig_in_pcdata}=0; } else { # create the node as a child of the current element $ent->set_parent( $twig_current); if( my $prev_sibling= $twig_current->{last_child}) { $ent->set_prev_sibling( $prev_sibling); $prev_sibling->{next_sibling}= $ent; } else { $twig_current->{first_child}= $ent; } $twig_current->set_last_child( $ent); } return $ent; } sub parser { return $_[0]->{twig_parser}; } # returns the declaration text (or a default one) sub xmldecl { my $t= shift; return '' unless( $t->{twig_xmldecl} || $t->{output_encoding}); my $decl_string; my $decl= $t->{twig_xmldecl}; if( $decl) { my $version= $decl->{version}; $decl_string= q{{output_encoding}) # or come from the document (in $decl->{encoding}) if( $t->{output_encoding}) { my $encoding= $t->{output_encoding}; $decl_string .= qq{ encoding="$encoding"}; } elsif( $decl->{encoding}) { my $encoding= $decl->{encoding}; $decl_string .= qq{ encoding="$encoding"}; } if( defined( $decl->{standalone})) { $decl_string .= q{ standalone="}; $decl_string .= $decl->{standalone} ? "yes" : "no"; $decl_string .= q{"}; } $decl_string .= "?>\n"; } else { my $encoding= $t->{output_encoding}; $decl_string= qq{}; } return $decl_string; } # returns the doctype text (or none) # that's the doctype just has it was in the original document sub doctype { my $t= shift; my $doctype= $t->{'twig_doctype'} or return ''; my $string= "{name}; $string .= qq{ SYSTEM "$doctype->{sysid}"} if( $doctype->{sysid}); $string .= qq{ PUBLIC "$doctype->{pub}" } if( $doctype->{pub}); $string .= "\n" . $doctype->{internal} . "\n"; return $string; } sub set_doctype { my( $t, $name, $system, $public, $internal)= @_; $t->{twig_doctype} ||= {}; my $doctype= $t->{twig_doctype}; $doctype->{name} = $name if( defined $name); $doctype->{sysid} = $system if( defined $system); $doctype->{pub} = $public if( defined $public); $doctype->{internal} = $internal if( defined $internal); } # return the dtd object sub dtd { my $t= shift; return $t->{twig_dtd}; } # return an element model, or the list of element models sub model { my $t= shift; my $elt= shift; return $t->dtd->{'model'}->{$elt} if( $elt); return sort keys %{$t->{'dtd'}->{'model'}}; } # return the entity_list object sub entity_list($) { my $t= shift; return $t->{twig_entity_list}; } # return the list of entity names sub entity_names($) { my $t= shift; return sort keys %{$t->{twig_entity_list}} ; } # return the entity object sub entity($$) { my $t= shift; my $entity_name= shift; return $t->{twig_entity_list}->{$entity_name}; } sub print_prolog { my $t= shift; my $fh= shift if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB' ) ); if( $fh) { print $fh $t->prolog( @_); } else { print $t->prolog( @_); } } sub prolog { my $t= shift; my %args= @_; my $prolog=''; return $prolog if( $t->{no_prolog}); my $update_dtd = $args{Update_DTD} || ''; $prolog .= $t->xmldecl; return $prolog unless( defined $t->{'twig_doctype'} || defined $t->{no_dtd_output}); my $doctype= $t->{'twig_doctype'}; if( $update_dtd) { if( defined $doctype->{sysid} ) { $prolog .= "{name}; $prolog .= " PUBLIC \"$doctype->{pub}\"" if( $doctype->{pub}); $prolog .= " SYSTEM \"$doctype->{sysid}\"" if( $doctype->{sysid} && !$doctype->{pub}); $prolog .= "[\n"; $prolog .= $t->{twig_entity_list}->text; $prolog .= "]>\n"; } else { my $dtd= $t->{'twig_dtd'}; $prolog .= $t->dtd_text; } } else { $prolog .= "{name} if( $doctype->{name}); $prolog .= " PUBLIC \"$doctype->{pub}\"" if( $doctype->{pub}); $prolog .= " SYSTEM" if( $doctype->{sysid} && !$doctype->{pub}); $prolog .= ' "' . $doctype->{sysid} . '"' if( $doctype->{sysid}); if( $doctype->{internal}) { $prolog .= "\n" if( $doctype->{internal}=~ /^\s*[[<]/s); $prolog .= $doctype->{internal}; } unless( $parser_version > 2.27) { $prolog .= ">\n" unless( $t->{twig_no_expand}); } } # terrible hack, as I can't figure out in which case the darn prolog # should get an extra > $prolog=~ s/(>\s*)*$/>/; return $prolog; } sub print_prolog_data { my $t= shift; my $fh= shift if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB' ) ); if( $fh) { print $fh $t->prolog_data( @_); } else { print $t->prolog_data( @_); } } sub prolog_data { my $t= shift; return'' unless( $t->{prolog_data}); my $prolog_data_text=''; foreach ( $t->{prolog_data}->children) { $prolog_data_text .= $_->sprint . "\n"; } return$ prolog_data_text; } sub print { my $t= shift; my $fh= shift if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB') ); my %args= @_; my $old_pretty; if( defined $args{PrettyPrint}) { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); delete $args{PrettyPrint}; } my $old_empty_tag_style; if( defined $args{EmptyTags}) { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); delete $args{EmptyTags}; } if( $fh) { $t->print_prolog( $fh, %args); $t->print_prolog_data( $fh, %args); } else { $t->print_prolog( %args); $t->print_prolog_data( %args); } $t->{twig_root}->print( $fh) if( $t->{twig_root}); $t->set_pretty_print( $old_pretty) if( defined $old_pretty); $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); } sub flush { my $t= shift; my $fh= shift if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB') ); my $old_select= select $fh if( defined $fh); my $up_to= shift if( ref $_[0]); my %args= @_; my $old_pretty; if( defined $args{PrettyPrint}) { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); delete $args{PrettyPrint}; } my $old_empty_tag_style; if( $args{EmptyTags}) { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); delete $args{EmptyTags}; } # the "real" last element processed, as twig_end has closed it my $last_elt; if( $up_to) { $last_elt= $up_to; } elsif( $t->{twig_current}) { $last_elt= $t->{twig_current}->_last_child; } else { $last_elt= $t->{twig_root}; } # flush the DTD unless it has ready flushed (id root has been flushed) my $elt= $t->{twig_root}; $t->print_prolog( %args) unless( $elt->{flushed}); while( $elt) { my $next_elt; if( $last_elt && $last_elt->in( $elt)) { unless( $elt->{flushed}) { # just output the front tag print $elt->start_tag(); $elt->{'flushed'}=1; } $next_elt= $elt->{first_child}; } else { # an element before the last one or the last one, $next_elt= $elt->{next_sibling}; $elt->_flush(); $elt->delete; last if( $last_elt && ($elt == $last_elt)); } $elt= $next_elt; } select $old_select if( defined $old_select); $t->set_pretty_print( $old_pretty) if( defined $old_pretty); $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); } # flushes up to an element # this method just reorders the arguments and calls flush sub flush_up_to { my $t= shift; my $up_to= shift; if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB') ) { my $fh= shift; $t->flush( $fh, $up_to, @_); } else { $t->flush( $up_to, @_); } } # same as print except the entire document text is returned as a string sub sprint { my $t= shift; my %args= @_; my $old_pretty; if( $args{PrettyPrint}) { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); delete $args{PrettyPrint}; } my $old_empty_tag_style; if( $args{EmptyTags}) { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); delete $args{EmptyTags}; } my $prolog= $t->prolog( %args) || ''; my $prolog_data= $t->prolog_data( %args) || ''; my $string= $prolog . $prolog_data . $t->{twig_root}->sprint; $t->set_pretty_print( $old_pretty) if( $old_pretty); $t->set_empty_tag_style( $old_empty_tag_style) if( $old_empty_tag_style); return $string; } # this method discards useless elements in a tree # it does the same thing as a purge except it does not print it # the second argument is an element, the last purged element # (this argument is usually set through the purge_up_to method) sub purge { my $t= shift; my $up_to= shift; # the "real" last element processed, as twig_end has closed it my $last_elt; if( $up_to) { $last_elt= $up_to; } elsif( $t->{twig_current}) { $last_elt= $t->{twig_current}->_last_child; } else { $last_elt= $t->{twig_root}; } my $elt= $t->{twig_root}; while( $elt) { my $next_elt; if( $last_elt && $last_elt->in( $elt)) { $elt->{'flushed'}=1; $next_elt= $elt->{first_child}; } else { # an element before the last one or the last one, $next_elt= $elt->{next_sibling}; $elt->delete; last if( $elt == $last_elt); } $elt= $next_elt; } } # flushes up to an element. This method just calls purge sub purge_up_to { my $t= shift; my $up_to= shift; $t->purge( $up_to); } sub root { return $_[0]->{twig_root}; } #start-extract twig_document (used to generate XML::(DOM|GDOME)::Twig) sub first_elt { my( $t, $cond)= @_; my $root= $t->root || return undef; return $root if( $root->passes( $cond)); return $root->next_elt( $cond); } sub next_n_elt { my( $t, $offset, $cond)= @_; return $t->root->next_n_elt( $offset -1, $cond); } sub get_xpath { my $twig= shift; if( UNIVERSAL::isa( $_[0], 'ARRAY')) { my $elt_array= shift; return map { $_->get_xpath( @_) } @$elt_array; } else { return $twig->root->get_xpath( @_); } } # return a list with just the root # if a condition is given then return an empty list unless the root matches sub children { my( $t, $cond)= @_; my $root= $t->root; unless( $cond && !($root->passes( $cond)) ) { return ($root); } else { return (); } } sub _children { return ($_[0]->root); } sub descendants { my( $t, $cond)= @_; my $root= $t->root; if( $root->passes( $cond) ) { return ($root, $root->descendants( $cond)); } else { return ( $root->descendants( $cond)); } } sub subs_text { my $t= shift; $t->root->subs_text( @_); } #end-extract twig_document sub set_keep_encoding { return XML::Twig::Elt::set_keep_encoding( @_); } sub set_expand_external_entities { return XML::Twig::Elt::set_expand_external_entities( @_); } # WARNING: at the moment the id list is not updated reliably sub elt_id { return $_[0]->{twig_id_list}->{$_[1]}; } # change it in ALL twigs at the moment sub change_gi { my( $twig, $old_gi, $new_gi)= @_; my $index; return unless($index= $XML::Twig::gi2index{$old_gi}); $XML::Twig::index2gi[$index]= $new_gi; delete $XML::Twig::gi2index{$old_gi}; $XML::Twig::gi2index{$new_gi}= $index; } # builds the DTD from the stored (possibly updated) data sub dtd_text { my $t= shift; my $dtd= $t->{twig_dtd}; my $doctype= $t->{'twig_doctype'} or return ''; my $string= "{name}; unless( $parser_version > 2.27) { $string .= "[\n"; } foreach my $gi (@{$dtd->{elt_list}}) { $string.= "{'model'}->{$gi}.">\n" ; if( $dtd->{att}->{$gi}) { my $attlist= $dtd->{att}->{$gi}; $string.= "{$att}->{type} ". "$attlist->{$att}->{default}"; if( $attlist->{$att}->{fixed}) { $string .= " #FIXED"}; $string.= "\n"; } $string.= ">\n"; } } $string.= $t->entity_list->text if( $t->entity_list); $string.= "\n]>\n"; return $string; } # prints the DTD from the stored (possibly updated) data sub dtd_print { my $t= shift; my $fh= shift if( defined( $_[0]) && UNIVERSAL::isa($_[0], 'GLOB') ); if( $fh) { print $fh $t->dtd_text; } else { print $t->dtd_text; } } # build the subs that call directly expat BEGIN { my @expat_methods= qw( depth in_element within_element context current_line current_column current_byte namespace eq_name generate_ns_name new_ns_prefixes expand_ns_prefix current_ns_prefixes recognized_string original_string xpcroak xpcarp xml_escape base current_element element_index position_in_context); foreach my $method (@expat_methods) { no strict 'refs'; *{$method}= sub { my $t= shift; warn "calling $method after parsing is finished" unless( $t->{twig_parsing}); return $t->{twig_parser}->$method(\@_); }; } } sub path { my( $t, $gi)= @_; return "/" . join( "/", ($t->{twig_parser}->context, $gi)); } sub finish { my $t= shift; return $t->{twig_parser}->finish; } # just finish the parse by printing the rest of the document sub finish_print { my( $t, $fh)= @_; select $fh if( defined $fh); $t->flush; my $p=$t->{twig_parser}; if( $t->{twig_keep_encoding}) { $p->setHandlers( %twig_handlers_finish_print); } else { $p->setHandlers( %twig_handlers_finish_print_original); } } sub set_output_filter { return XML::Twig::Elt::set_output_filter( @_); } sub set_input_filter { my( $t, $input_filter)= @_; my $old_filter= $t->{twig_input_filter}; if( !$input_filter || UNIVERSAL::isa( $input_filter, 'CODE') ) { $t->{twig_input_filter}= $input_filter; } elsif( $input_filter eq 'latin1') { $t->{twig_input_filter}= latin1(); } elsif( $filter{$input_filter}) { $t->{twig_input_filter}= $filter{$input_filter}; } else { croak "invalid input filter: $input_filter"; } return $old_filter; } sub set_empty_tag_style { return XML::Twig::Elt::set_empty_tag_style( @_); } sub set_pretty_print { return XML::Twig::Elt::set_pretty_print( @_); } sub set_quote { return XML::Twig::Elt::set_quote( @_); } sub set_indent { return XML::Twig::Elt::set_indent( @_); } # save and restore package globals (the ones in XML::Twig::Elt) sub save_global_state { my $t= shift; $t->{twig_saved_state}= XML::Twig::Elt::global_state(); } sub restore_global_state { my $t= shift; XML::Twig::Elt::set_global_state( $t->{twig_saved_state}); } sub global_state { return XML::Twig::Elt::global_state(); } sub set_global_state { return XML::Twig::Elt::set_global_state( $_[1]); } sub dispose { my $t= shift; $t->DESTROY; } sub DESTROY { my $t= shift; if( $t->{twig_root} &&UNIVERSAL::isa( $t->{twig_root}, 'XML::Twig')) { $t->{twig_root}->delete } # added to break circular references undef $t->{twig}; undef $t->{twig_root}->{twig} if( $t->{twig_root}); undef $t->{twig_parser}; $t={}; # prevents memory leaks (especially when using mod_perl) undef $t; } # # non standard handlers # # kludge: expat 1.95.2 calls both Default AND Doctype handlers # so if the default handler finds ' @_) if( DEBUG); my $p= shift; my $string= $p->recognized_string(); #print STDERR "twig_print: /", $p->recognized_string(), "/\n"; if( $string eq 'setHandlers( Default => undef); $p->{twig}->{expat_1_95_2}=1; } else { print $string; } } sub twig_print { twig_log( twig_print => @_) if( DEBUG); print $_[0]->recognized_string(); } sub twig_print_default { twig_log( twig_print_default => @_) if( DEBUG); my( $p, $string)= @_; #print $string; print $p->recognized_string(); # print STDERR "twig_print_default: /", $string, "/\n"; } # recognized_string does not seem to work for entities, go figure! # so this handler is not used sub twig_print_entity { twig_log( twig_print_entity => @_) if( DEBUG); my $p= shift; } # account for the case where the element is empty sub twig_print_end { twig_log( twig_print_end => @_) if( DEBUG); my $p= shift; print $p->recognized_string(); # print $p->recognized_string() unless( $p->recognized_string()=~ /\/>\Z/); } # kludge: expat 1.95.2 calls both Default AND Doctype handlers # so if the default handler finds ' @_) if( DEBUG); my $p= shift; #warn "\ntwig_print_original (default): ", $p->original_string(), "\n"; my $string= $p->original_string(); if( $string eq 'setHandlers( Default => undef); $p->{twig}->{expat_1_95_2}=1; } else { print $string; } } sub twig_print_original { twig_log( twig_print_original => @_) if( DEBUG); print $_[0]->original_string(); } sub twig_print_original_doctype { twig_log( twig_print_original_doctype => @_) if( DEBUG); my( $p, $name, $sysid, $pubid, $internal)= @_; #warn "\n in twig_print_original_doctype\n original_string: " . $p->original_string . # " name: $name, sysid: $sysid, pubid: $pubid, internal: $internal\n"; if( $name) { # with recent versions of XML::Parser original_string does not work, # hence we need to rebuild the doctype declaration my $doctype= qq{} unless( $p->{twig}->{expat_1_95_2}); print $doctype; } $p->setHandlers( Default => \&twig_print_original); } sub twig_print_doctype { twig_log( twig_print_doctype => @_) if( DEBUG); my( $p, $name, $sysid, $pubid, $internal)= @_; #warn "\n in twig_print_original_doctype\n string: " . $p->recognized_string . # " name: $name, sysid: $sysid, pubid: $pubid, internal: $internal\n"; if( $name) { # with recent versions of XML::Parser original_string does not work, # hence we need to rebuild the doctype declaration my $doctype= qq{} unless( $p->{twig}->{expat_1_95_2}); print $doctype; } $p->setHandlers( Default => \&twig_print_original); } sub twig_print_original_default { twig_log( twig_print_original_default => @_) if( DEBUG); my $p= shift; print $p->original_string(); # print STDERR "DEFAULT[", $p->original_string(), "]"; } # account for the case where the element is empty sub twig_print_end_original { twig_log( twig_print_end_original => @_) if( DEBUG); my $p= shift; print $p->original_string(); # print $p->original_string() unless( $p->original_string()=~ /\/>\Z/); } sub twig_start_check_roots { twig_log( twig_start_check_roots => @_) if( DEBUG); my( $p, $gi, %att)= @_; my $t= $p->{twig}; if( $p->depth == 0) { twig_start( $p, $gi, %att); } elsif( handler( $t, $t->{twig_roots}, $gi, \%att)) { $p->setHandlers( %twig_handlers); # restore regular handlers twig_start( $p, $gi, %att); } elsif( $t->{twig_starttag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_starttag_handlers}, $gi, \%att); foreach my $handler ( @handlers) { $handler->($t, $gi, %att) || last; } } } sub twig_start_check_roots_print { twig_log( twig_start_check_roots_print => @_) if( DEBUG); my( $p, $gi, %att)= @_; my $t= $p->{twig}; if( $p->depth == 0) { if( my $fh= $t->{twig_output_fh}) { print $fh $p->recognized_string(); } else { print $p->recognized_string(); } twig_start( $p, $gi, %att); $t->set_fh_to_twig_output_fh(); } elsif( handler( $t, $t->{twig_roots}, $gi, \%att)) { $p->setHandlers( %twig_handlers); # restore regular handlers $t->set_fh_to_selected_fh(); # select the proper output fh twig_start( $p, $gi, %att); } elsif( $t->{twig_starttag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_starttag_handlers}, $gi, \%att); my $last_handler_res; $t->set_fh_to_selected_fh() if( @handlers); # select the proper output fh foreach my $handler ( @handlers) { $last_handler_res= $handler->($t, $gi, %att); last unless $last_handler_res; } $t->set_fh_to_twig_output_fh() if( @handlers); # re-select the twig output fh print $p->recognized_string() if( !@handlers || $last_handler_res); } else { print $p->recognized_string(); } } sub twig_start_check_roots_print_original { twig_log( twig_start_check_roots_print_original => @_) if( DEBUG); my( $p, $gi, %att)= @_; my $t= $p->{twig}; if( $p->depth == 0) { if( my $fh= $t->{twig_output_fh}) { print $fh $p->original_string(); } else { print $p->original_string(); } twig_start( $p, $gi, %att); $t->set_fh_to_twig_output_fh(); } elsif( handler( $t, $t->{twig_roots}, $gi, \%att)) { $p->setHandlers( %twig_handlers); # restore regular handlers $t->set_fh_to_selected_fh(); # select the proper output fh twig_start( $p, $gi, %att); } elsif( $t->{twig_starttag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_starttag_handlers}, $gi, \%att); my $last_handler_res; $t->set_fh_to_selected_fh() if( @handlers); # select the proper output fh foreach my $handler ( @handlers) { $last_handler_res= $handler->($t, $gi, %att); last unless( $last_handler_res); } $t->set_fh_to_twig_output_fh() if( @handlers); # re-select the twig output fh print $p->original_string() if( !@handlers || $last_handler_res); } else { print $p->original_string(); } } sub twig_end_check_roots { twig_log( twig_end_check_roots => @_) if( DEBUG); my( $p, $gi)= @_; my $t= $p->{twig}; if( $t->{twig_endtag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_endtag_handlers}, $gi, {}); my $last_handler_res=1; foreach my $handler ( @handlers) { $last_handler_res= $handler->($t, $gi) || last; } } if( $p->depth == 0) { twig_end( $p, $gi); } } sub twig_end_check_roots_print { twig_log( twig_end_check_roots_print => @_) if( DEBUG); my( $p, $gi, %att)= @_; my $t= $p->{twig}; if( $t->{twig_endtag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_endtag_handlers}, $gi, {}); my $last_handler_res=1; $t->set_fh_to_selected_fh() if( @handlers); # select the proper output fh foreach my $handler ( @handlers) { $last_handler_res= $handler->($t, $gi) || last; } $t->set_fh_to_twig_output_fh() if( @handlers); # re-select the twig output fh return unless $last_handler_res; } print $p->recognized_string(); $t->set_fh_to_selected_fh() if( $p->depth == 0); if( $p->depth == 0) { twig_end( $p, $gi); } } sub twig_end_check_roots_print_original { twig_log( twig_end_check_roots_print_original => @_) if( DEBUG); my( $p, $gi, %att)= @_; my $t= $p->{twig}; if( $t->{twig_endtag_handlers}) { # look for start tag handlers my @handlers= handler( $t, $t->{twig_endtag_handlers}, $gi, {}); my $last_handler_res=1; $t->set_fh_to_selected_fh() if( @handlers); # select the proper output fh foreach my $handler ( @handlers) { $last_handler_res= $handler->($t, $gi) || last; } $t->set_fh_to_twig_output_fh() if( @handlers); # re-select the twig output fh return unless $last_handler_res; } print $p->original_string(); $t->set_fh_to_selected_fh() if( $p->depth == 0); if( $p->depth == 0) { twig_end( $p, $gi); } } sub twig_pi_check_roots { my( $p, $target, $data)= @_; my $t= $p->{twig}; if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target} || $t->{twig_handlers}->{pi_handlers}->{''} ) { twig_pi( @_); } } sub twig_ignore_start { twig_log( twig_ignore_start => @_) if( DEBUG); my( $p, $gi)= @_; my $t= $p->{twig}; return unless( $gi eq $t->{twig_ignore_gi}); $t->{twig_ignore_level}++; my $action= $t->{twig_ignore_action}; if( $action eq 'print' ) { if( $t->{twig_keep_encoding}) { twig_print_original( @_); } else { twig_print( @_); } } elsif( $action eq 'string' ) { if( $t->{twig_keep_encoding}) { $t->{twig_buffered_string} .= $p->original_string(); } else { $t->{twig_buffered_string} .= $p->recognized_string(); } } } sub twig_ignore_end { twig_log( twig_ignore_end => @_) if( DEBUG); my( $p, $gi)= @_; my $t= $p->{twig}; my $action= $t->{twig_ignore_action}; if( $action eq 'print') { if( $t->{twig_keep_encoding}) { twig_print_original( $p, $gi); } else { twig_print( $p, $gi); } } elsif( $action eq 'string') { if( $t->{twig_keep_encoding}) { $t->{twig_buffered_string} .= $p->original_string(); } else { $t->{twig_buffered_string} .= $p->recognized_string(); } } return unless( $gi eq $t->{twig_ignore_gi}); $t->{twig_ignore_level}--; unless( $t->{twig_ignore_level}) { $t->{twig_ignore_elt}->delete; $p->setHandlers( @{$t->{twig_saved_handlers}}) }; } sub ignore { my $t= shift; my $elt; # get the element (default: current elt) if( $_[0] && UNIVERSAL::isa( $_[0], 'XML::Twig::Elt')) { $elt= shift; } else { $elt = $t->{twig_current}; } my $action= shift || 1; $t->{twig_ignore_action}= $action; $t->{twig_ignore_elt}= $elt; # save it $t->{twig_ignore_gi}= $XML::Twig::index2gi[$elt->{'gi'}]; # save its gi $t->{twig_ignore_level}++; my $p= $t->{twig_parser}; my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers if( $action eq 'print') { if( $t->{twig_keep_encoding}) { $p->setHandlers( Default => \&twig_print_original); } else { $p->setHandlers( Default => \&twig_print); } } elsif( $action eq 'string') { # not used at the moment $t->{twig_buffered_string}=''; if( $t->{twig_keep_encoding}) { $p->setHandlers( Default => \&twig_buffer_original); } else { $p->setHandlers( Default => \&twig_buffer_original); } } $t->{twig_saved_handlers}= \@saved_handlers; # save current handlers } # select $t->{twig_output_fh} and store the current selected fh sub set_fh_to_twig_output_fh { my $t= shift; my $output_fh= $t->{twig_output_fh}; if( $output_fh && !$t->{twig_output_fh_selected}) { # there is an output fh $t->{twig_selected_fh}= select(); # store the currently selected fh $t->{twig_output_fh_selected}=1; select $output_fh; # select the output fh for the twig } } # select the fh that was stored in $t->{twig_selected_fh} # (before $t->{twig_output_fh} was selected) sub set_fh_to_selected_fh { my $t= shift; return unless( $t->{twig_output_fh}); my $selected_fh= $t->{twig_selected_fh}; $t->{twig_output_fh_selected}=0; select $selected_fh; return; } sub encoding { return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); } sub set_encoding { my( $t, $encoding)= @_; $t->{twig_xmldecl} ||={}; $t->set_xml_version( "1.0") unless( $t->xml_version); return $t->{twig_xmldecl}->{encoding}= $encoding; } sub output_encoding { return $_[0]->{output_encoding}; } sub set_output_encoding { my( $t, $encoding)= @_; $t->set_output_filter( encoding_filter( $encoding)); return $t->{output_encoding}= $encoding; } sub xml_version { return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); } sub set_xml_version { my( $t, $version)= @_; $t->{twig_xmldecl} ||={}; return $t->{twig_xmldecl}->{version}= $version; } sub standalone { return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); } sub set_standalone { my( $t, $standalone)= @_; $t->{twig_xmldecl} ||={}; $t->set_xml_version( "1.0") unless( $t->xml_version); return $t->{twig_xmldecl}->{standalone}= $standalone; } # SAX methods sub toSAX1 { shift(@_)->toSAX(@_, \&XML::Twig::Elt::start_tag_data_SAX1, \&XML::Twig::Elt::end_tag_data_SAX1 ); } sub toSAX2 { shift(@_)->toSAX(@_, \&XML::Twig::Elt::start_tag_data_SAX2, \&XML::Twig::Elt::end_tag_data_SAX2 ); } sub toSAX { my( $t, $handler, $start_tag_data, $end_tag_data) = @_; die "cannot use toSAX while parsing" if (defined $t->{twig_parser}); if( my $start_document = $handler->can( 'start_document')) { $start_document->( $handler); } $t->prolog_toSAX( $handler); $t->root->toSAX( $handler, $start_tag_data, $end_tag_data) if( $t->root); if( my $end_document = $handler->can( 'end_document')) { $end_document->( $handler); } } sub flush_toSAX1 { shift(@_)->flush_toSAX(@_, \&XML::Twig::Elt::start_tag_data_SAX1, \&XML::Twig::Elt::end_tag_data_SAX1 ); } sub flush_toSAX2 { shift(@_)->flush_toSAX(@_, \&XML::Twig::Elt::start_tag_data_SAX2, \&XML::Twig::Elt::end_tag_data_SAX2 ); } sub flush_toSAX { my( $t, $handler, $start_tag_data, $end_tag_data, $up_to)= @_; # the "real" last element processed, as twig_end has closed it my $last_elt; if( $up_to) { $last_elt= $up_to; } elsif( $t->{twig_current}) { $last_elt= $t->{twig_current}->_last_child; } else { $last_elt= $t->{twig_root}; } # flush the DTD unless it has ready flushed (id root has been flushed) my $elt= $t->{twig_root}; $t->prolog_toSAX( $handler) unless( $elt->{flushed}); while( $elt) { my $next_elt; if( $last_elt && $last_elt->in( $elt)) { unless( $elt->{flushed}) { # just output the front tag if( my $start_element = $handler->can( 'start_element')) { $start_element->( $handler, $start_tag_data->( $elt)); } $elt->{'flushed'}=1; } $next_elt= $elt->{first_child}; } else { # an element before the last one or the last one, $next_elt= $elt->{next_sibling}; $elt->_flush_toSAX( $start_tag_data, $end_tag_data); $elt->delete; last if( $last_elt && ($elt == $last_elt)); } $elt= $next_elt; } } sub prolog_toSAX { my( $t, $handler)= @_; $t->xmldecl_toSAX( $handler); $t->DTD_toSAX( $handler); } sub xmldecl_toSAX { my( $t, $handler)= @_; my $decl= $t->{twig_xmldecl}; my $data= { Version => $decl->{version}, Encoding => $decl->{encoding}, Standalone => $decl->{standalone}, }; if( my $xml_decl= $handler->can( 'xml_decl')) { $xml_decl->( $handler, $data); } } sub DTD_toSAX { my( $t, $handler)= @_; my $doctype= $t->{twig_doctype}; my $data= { Name => $doctype->{name}, PublicId => $doctype->{pub}, SystemId => $doctype->{sysid}, }; if( my $start_dtd= $handler->can( 'start_dtd')) { $start_dtd->( $handler, $data); } # I should call code to export the internal subset here if( my $end_dtd= $handler->can( 'end_dtd')) { $end_dtd->( $handler); } } # input/output filters sub latin1 { if( eval 'require Text::Iconv;') { #warn "using iconv"; return iconv_convert( 'latin1'); } elsif( eval 'require Unicode::Map8 && require Unicode::String;') { #warn "using unicode convert"; return unicode_convert( 'latin1'); } else { return \®exp2latin1; } } sub encoding_filter { my $encoding= $_[1] || $_[0]; if( eval 'require Encode') { import Encode; my $sub= encode_convert( $encoding); return $sub; } if( eval 'require Text::Iconv;') { return iconv_convert( $encoding); } elsif( eval 'require Unicode::Map8 && require Unicode::String;') { return unicode_convert( $encoding); } croak "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed ", "in order to use encoding options"; } # shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27) sub regexp2latin1 { my $text=shift; $text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1); my $lo = ord($2); chr((($hi & 0x03) <<6) | ($lo & 0x3F)) }ge; return $text; } sub html_encode { require HTML::Entities; return HTML::Entities::encode(latin1->($_[0])); } sub safe_encode { my $str= shift; $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} {XmlUtf8Decode($1)}egs; return $str; } sub safe_encode_hex { my $str= shift; $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} {XmlUtf8Decode($1, 1)}egs; return $str; } # this one shamelessly lifted from XML::DOM sub XmlUtf8Decode { my ($str, $hex) = @_; my $len = length ($str); my $n; if ($len == 2) { my @n = unpack "C2", $str; $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); } elsif ($len == 3) { my @n = unpack "C3", $str; $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f); } elsif ($len == 4) { my @n = unpack "C4", $str; $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); } elsif ($len == 1) # just to be complete... { $n = ord ($str); } else { croak "bad value [$str] for XmlUtf8Decode"; } $hex ? sprintf ("&#x%x;", $n) : "&#$n;"; } sub unicode_convert { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly require Unicode::Map8; require Unicode::String; import Unicode::String qw(utf8); my $sub= eval q{ { my $cnv; BEGIN { $cnv= Unicode::Map8->new($enc) or croak "Can't create converter to $enc"; } sub { return $cnv->to8 (utf8($_[0])->ucs2); } } }; unless( $sub) { croak $@; } return $sub; } sub iconv_convert { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly require Text::Iconv; my $sub= eval q{ { my $cnv; BEGIN { $cnv = Text::Iconv->new( 'utf8', $enc) or croak "Can't create iconv converter to $enc"; } sub { return $cnv->convert( $_[0]); } } }; unless( $sub) { if( $@=~ m{^Unsupported conversion: Invalid argument}) { croak "Unsupported encoding: $enc"; } else { croak $@; } } return $sub; } sub encode_convert { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly my $sub= eval qq{sub { return encode( "$enc", \$_[0]); } }; croak "can't create Encode-based filter: $@" unless( $sub); return $sub; } sub twig_log { my $handler= shift; print "$handler\n"; } 1; ###################################################################### package XML::Twig::Entity_list; ###################################################################### sub new { my $class = shift; my $self={}; bless $self, $class; return $self; } sub add_new_ent { my $list= shift; my $ent= XML::Twig::Entity->new( @_); $list->add( $ent); } sub add { my( $list, $ent)= @_; $list->{$ent->{name}}= $ent; } # can be called with an entity or with an entity name sub delete { my $list= shift; if( ref $_[0] eq 'XML::Twig::Entity_list') { # the second arg was an entity my $ent= shift; delete $list->{$ent->{name}}; } else { # the second arg was not entity, must be a string then my $name= shift; delete $list->{$name}; } } sub print { my ($ent_list, $fh)= @_; my $old_select= select $fh if( defined $fh); foreach my $ent_name ( sort keys %{$ent_list}) { my $ent= $ent_list->{$ent_name}; # we have to test what the entity is or un-defined entities can creep in $ent->print() if( UNIVERSAL::isa( $ent, 'XML::Twig::Entity')); } select $old_select if( defined $old_select); } sub text { my ($ent_list)= @_; return join "\n", map { $ent_list->{$_}->text} sort keys %{$ent_list}; } sub list { my ($ent_list)= @_; return @{[$ent_list]}; } 1; ###################################################################### package XML::Twig::Entity; ###################################################################### sub new { my( $ent, $name, $val, $sysid, $pubid, $ndata)= @_; my $self={}; $self->{name}= $name; if( $val) { $self->{val}= $val; } else { $self->{sysid}= $sysid; $self->{pubid}= $pubid; $self->{ndata}= $ndata; } bless $self; return $self; } sub name { return $_[0]->{name}; } sub val { return $_[0]->{val}; } sub sysid { return $_[0]->{sysid}; } sub pubid { return $_[0]->{pubid}; } sub ndata { return $_[0]->{ndata}; } sub print { my ($ent, $fh)= @_; if( $fh) { print $fh $ent->text . "\n"; } else { print $ent->text . "\n"; } } sub text { my ($ent)= @_; if( exists $ent->{'val'}) { if( $ent->{'val'}=~ /"/) { return "{'name'} '$ent->{'val'}'>"; } return "{'name'} \"$ent->{'val'}\">"; } elsif( $ent->{'sysid'}) { my $text= "{'name'} "; $text .= "SYSTEM \"$ent->{'sysid'}\" " if( $ent->{'sysid'}); $text .= "PUBLIC \"$ent->{'pubid'}\" " if( $ent->{'pubid'}); $text .= "NDATA $ent->{'ndata'}" if( $ent->{'ndata'}); $text .= ">"; return $text; } } 1; ###################################################################### package XML::Twig::Elt; ###################################################################### use Carp; use constant PCDATA => '#PCDATA'; use constant CDATA => '#CDATA'; use constant PI => '#PI'; use constant COMMENT => '#COMMENT'; use constant ENT => '#ENT'; use constant ASIS => '#ASIS'; # pcdata elements not to be XML-escaped use constant ELT => '#ELT'; use constant TEXT => '#TEXT'; use constant EMPTY => '#EMPTY'; use constant CDATA_START => " "]]>"; use constant PI_START => " "?>"; use constant COMMENT_START => ""; use constant XMLNS_URI => 'http://www.w3.org/2000/xmlns/'; my $XMLNS_URI = XMLNS_URI; BEGIN { # set some aliases for methods *tag = *gi; *set_tag = *set_gi; *find_nodes = *get_xpath; *field = *first_child_text; *trimmed_field = *first_child_trimmed_text; *is_field = *contains_only_text; *is = *passes; *matches = *passes; *has_child = *first_child; *all_children_pass = *all_children_are; *all_children_match= *all_children_are; *getElementsByTagName= *descendants; *find_by_tag_name= *descendants_or_self; *first_child_is = *first_child_matches; *last_child_is = *last_child_matches; *next_sibling_is = *next_sibling_matches; *prev_sibling_is = *prev_sibling_matches; *next_elt_is = *next_elt_matches; *prev_elt_is = *prev_elt_matches; *parent_is = *parent_matches; *child_is = *child_matches; # try using weak references # test whether we can use weak references if( eval 'require Scalar::Util') { import Scalar::Util qw(weaken); } elsif( eval 'require WeakRef') { import WeakRef; } } # can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]]) # - gi is an optional gi given to the element # - $atts is a hashref to attributes for the element # - @content is an optional list of text and elements that will # be inserted under the element sub new { my $class= shift; my $self = {}; bless ($self, $class); return $self unless @_; # if a gi is passed then use it my $gi= shift; $self->set_gi( $gi); my $atts= shift if( ref $_[0] eq 'HASH'); if( $gi eq PCDATA) { $self->{pcdata}= shift; } elsif( $gi eq ENT) { $self->{ent}= shift; } elsif( $gi eq CDATA) { $self->{cdata}= shift; } elsif( $gi eq COMMENT) { $self->{comment}= shift; } elsif( $gi eq PI) { $self->set_pi( shift, shift); } else { # the rest of the arguments are the content of the element $self->set_content( @_) if @_; } if( $atts) { # the attribute hash can be used to pass the asis status if( defined $atts->{ASIS}) { $self->set_asis; delete $atts->{ASIS}; } $self->{'att'}= $atts if( keys %$atts); } return $self; } # this function creates an XM:::Twig::Elt from a string # it is quite clumsy at the moment, as it just creates a # new twig then returns its root # there might also be memory leaks there # additional arguments are passed to new XML::Twig sub parse { my $class= shift; my $string= shift; my %args= @_; my $t= XML::Twig->new(%args); $t->parse( $string); my $self= $t->root; # clean-up the node delete $self->{twig}; # get rid of the twig data delete $self->{twig_current}; # better get rid of this too return $self; } sub set_gi { my ($elt, $gi)= @_; unless( defined $XML::Twig::gi2index{$gi}) { # new gi, create entries in %gi2index and @index2gi push @XML::Twig::index2gi, $gi; $XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi; } $elt->{gi}= $XML::Twig::gi2index{$gi}; } sub gi { return $XML::Twig::index2gi[$_[0]->{gi}]; } # return #ELT for an element and #PCDATA... for others sub get_type { my $gi_nb= $_[0]->{gi}; # the number, not the string return ELT if( $gi_nb > $XML::Twig::SPECIAL_GI); return $_[0]->gi; } # return the gi if it's a "real" element, 0 otherwise sub is_elt { return $_[0]->gi if( $_[0]->{gi} > $XML::Twig::SPECIAL_GI); return 0; } sub is_pcdata { my $elt= shift; return (exists $elt->{'pcdata'}); } sub is_cdata { my $elt= shift; return (exists $elt->{'cdata'}); } sub is_pi { my $elt= shift; return (exists $elt->{'target'}); } sub is_comment { my $elt= shift; return (exists $elt->{'comment'}); } sub is_ent { my $elt= shift; return (exists $elt->{ent} || $elt->{ent_name}); } sub is_text { my $elt= shift; return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'})); } sub is_empty { return $_[0]->{empty} || 0; } sub set_empty { $_[0]->{empty}= 1 unless( ($_[0]->{'empty'} || 0)); } sub set_not_empty { delete $_[0]->{empty} if( ($_[0]->{'empty'} || 0)); } sub set_asis { my $elt=shift; if( (exists $elt->{'pcdata'})) { $elt->{asis}= 1; return; } if( (exists $elt->{'cdata'})) { $elt->set_gi( PCDATA); $elt->{pcdata}= $elt->{cdata}; $elt->{asis}= 1; return; } foreach my $pcdata ($elt->descendants( PCDATA)) { $pcdata->{asis}= 1;} foreach my $cdata ($elt->descendants( CDATA)) { $elt->set_gi( PCDATA); $elt->{pcdata}= $elt->{cdata}; delete $elt->{cdata}; $elt->{asis}= 1; } } sub set_not_asis { my $elt=shift; $elt->{asis}= 0 if $elt->{asis}; foreach my $pcdata ($elt->descendants()) { delete $pcdata->{asis} if $elt->{asis};} } sub is_asis { return $_[0]->{asis}; } sub closed { my $elt= shift; my $t= $elt->twig || return; my $curr_elt= $t->{twig_current}; return unless( $curr_elt); return $curr_elt->in( $elt); } sub set_pcdata { delete $_[0]->{empty} if( $_[0]->is_empty); return( $_[0]->{'pcdata'}= $_[1]); } sub append_pcdata { return( $_[0]->{'pcdata'}.= $_[1]); } sub pcdata { return $_[0]->{pcdata}; } sub set_data { return( $_[0]->{'data'}= $_[1]); } sub data { return $_[0]->{data}; } sub append_extra_data { return( $_[0]->{extra_data}.= $_[1]); } sub set_extra_data { return( $_[0]->{extra_data}= $_[1]); } sub extra_data { return $_[0]->{extra_data}; } sub set_target { return( $_[0]->{'target'}= $_[1]); } sub target { return $_[0]->{target}; } sub set_pi { $_[0]->{target}= $_[1]; $_[0]->{data}= $_[2]; } sub pi_string { return PI_START . $_[0]->{target} . " " . $_[0]->{data} . PI_END; } sub set_comment { return( $_[0]->{comment}= $_[1]); } sub comment { return $_[0]->{comment}; } sub comment_string { return COMMENT_START . $_[0]->{comment} . COMMENT_END; } sub set_ent { return( $_[0]->{ent}= $_[1]); } sub ent { return $_[0]->{ent}; } sub ent_name { return substr( $_[0]->{ent}, 1, -1);} sub set_cdata { delete $_[0]->{empty} if( $_[0]->is_empty); return( $_[0]->{'cdata'}= $_[1]); } sub append_cdata { return( $_[0]->{'cdata'}.= $_[1]); } sub cdata { return $_[0]->{'cdata'}; } sub cdata_string { return CDATA_START . $_[0]->{cdata} . CDATA_END; } #start-extract twig_node sub contains_only_text { my $elt= shift; return 0 unless $elt->is_elt; foreach my $child ($elt->children) { return 0 if $child->is_elt; } return 1; } sub contains_only { my( $elt, $exp)= @_; my @children= $elt->children; foreach my $child (@children) { return 0 unless $child->is( $exp); } return @children; } sub root { my $elt= shift; while( $elt->{parent}) { $elt= $elt->{parent}; } return $elt; } #end-extract twig_node sub twig { my $elt= shift; my $root= $elt->root; return $root->{twig}; } #start-extract twig_node # returns undef or the element, depending on whether $elt passes $cond # $cond can be # - empty: the element passes the condition # - ELT ('#ELT'): the element passes the condition if it is a "real" element # - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element # - a string: the element passes if its gi is equal to the string # - a regexp: the element passes if its gi matches the regexp # - a code ref: the element passes if the code, applied on the element, # returns true my %cond_cache; # expression => coderef { # my %cond_cache; # expression => coderef sub install_cond { my $cond= shift; my $sub; my $test; my $original_cond= $cond; my $not= ($cond=~ s{^\s*!}{}) ? '!' : ''; if( ref $cond eq 'CODE') { return $cond; } if( ref $cond eq 'Regexp') { $test = qq{(\$_[0]->gi=~ /$cond/)}; } else { # the condition is a string if( $cond eq ELT) { $test = qq{\$_[0]->is_elt}; } elsif( $cond eq TEXT) { $test = qq{\$_[0]->is_text}; } elsif( $cond=~ m{^\s*($REG_NAME_W)\s*$}o) { # gi if( $1 ne '*') { # 2 options, depending on whether the gi exists in gi2index # start optimization my $gi= $XML::Twig::gi2index{$1}; if( $gi) { # the gi exists, use its index as a faster shortcut $test = qq{ \$_[0]->{gi} eq "$XML::Twig::gi2index{$1}"}; } else # end optimization { # it does not exist (but might be created later), compare the strings $test = qq{ \$_[0]->gi eq "$1"}; } } else { $test = qq{ (1) } } } elsif( $cond=~ m{^\s*($REG_REGEXP)\s*$}o) { # /regexp/ $test = qq{ \$_[0]->gi=~ $1 }; } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s*\[\s*(\!\s*)?\@($REG_NAME)\s*\]\s*$}o) { # gi[@att] my( $gi, $not, $att)= ($1, $2, $3); $not||=''; if( $gi && ($gi ne '*')) { $test = qq{ (\$_[0]->gi eq "$gi") && $not(defined \$_[0]->{'att'}->{"$att"}) }; } else { $test = qq{ $not (defined \$_[0]->{'att'}->{"$att"})}; } } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\@($REG_NAME) # [@$2 \s*($REG_OP)\s* # = (or other op) $3 ($REG_VALUE) # "$4" or '$4' \s*\]\s*$}xo) # ] { # gi[@att="val"] my( $gi, $att, $op, $string)= ($1, $2, op( $3), $4); if( $gi && ($gi ne '*')) { $test = qq{ (\$_[0]->gi eq "$gi") && (defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $op $string) }; } else { $test = qq{ (defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $op $string) }; } } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\@($REG_NAME) # [@$2 \s*($REG_MATCH)\s* # =~ or !~ ($3) ($REG_REGEXP) # /$4/ \s*\]\s*$}xo) # ] { # gi[@att=~ /regexp/] or gi[@att!~/regexp/] my( $gi, $att, $match, $regexp)= ($1, $2, $3, $4); if( $gi && ($gi ne '*')) { $test = qq{ (\$_[0]->gi eq "$gi") && ( defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $match $regexp) }; } else { # *[@att=~/regexp/ or *[@att!~/regexp/ $test = qq{( defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $match $regexp) }; } } elsif( $cond=~ m{^\s*\@($REG_NAME)\s*$}o) { # @att (or !@att) my( $att)= ($1); $test = qq{ (defined \$_[0]->{'att'}->{"$att"})}; } elsif( $cond=~ m{^\s* \@($REG_NAME) # @$1 \s*($REG_OP)\s* # = (or other op) $2 ($REG_VALUE) # "$3" or '$3' \s*$}xo) { # @att="val" my( $att, $op, $string)= ( $1, op( $2), $3); $test = qq{ (defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $op $string) }; } elsif( $cond=~ m{^\s* \@($REG_NAME) # [@$1 \s*(=~|!~)\s* # =~ or !~ ($2) ($REG_REGEXP) # /$3/ \s*\s*$}xo) # ] { # @att=~ /regexp/ or @att!~/regexp/ my( $att, $match, $regexp)= ( $2, $3, $4); $test = qq{( defined \$_[0]->{'att'}->{"$att"}) && ( \$_[0]->{'att'}->{"$att"} $match $regexp) }; } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\s*text(?:\(\s*\))? # [text() \s*($REG_OP)\s* # = or other op ($2) ($REG_VALUE) # "$3" or '$3' \s*\]\s*$}xo) # ] { # gi[text()= "val"] my ($gi, $op, $text)= ($1, op( $2), $3); if( $gi && ($gi ne '*')) { $test = qq{(\$_[0]->gi eq "$gi") && ( \$_[0]->text eq $text)}; } else { $test = qq{ \$_[0]->text eq $text }; } } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\s*text(?:\(\s*\))? # [text() \s*($REG_MATCH)\s* # =~ or !~ ($2) ($REG_REGEXP) # /$3/ \s*\]\s*$}xo) # ] { # gi[text()=~ /regexp/] my( $gi, $match, $regexp)= ($1, $2, $3); if( $gi && ($gi ne '*')) { $test = qq{(\$_[0]->gi eq "$gi") && ( \$_[0]->text $match $regexp) }; } else { $test = qq{ \$_[0]->text $match $regexp }; } } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\s*text\s*\(\s* # [text( ($REG_NAME)\s*\) # $2) \s*($REG_OP)\s* # = or other op $3 ($REG_VALUE) # "$4" or '$4' \s*\]\s*$}xo) # ] { # gi[text(gi2)= "text"] my ($gi, $gi2, $op, $text)= ($1, $2, op($3), $4); $text=~ s/([{}])/\\$1/g; #warn "gi: $gi - gi2: $gi2 - text: $text"; if( $gi && ($gi ne '*')) { $test = qq{ (\$_[0]->gi eq "$gi") && ( \$_[0]->first_child( qq{$gi2\[text() $op $text]})) }; } else { $test = qq{ \$_[0]->first_child(qq{$gi2\[text() $op $text]}) } ; } #warn "$cond: $test\n"; } elsif( $cond=~ m{^\s*($REG_NAME_W)?\s* # $1 \[\s*text\(\s* # [text( ($REG_NAME)\s*\) # $2) \s*(=~|!~)\s* # =~ or !~ ($3) ($REG_REGEXP) # /$4/ \s*\]\s*$}xo) # ] { # gi[text(gi2)=~ /regexp/] my( $gi, $gi2, $match, $regexp)= ($1, $2, $3, $4); if( $gi && ($gi ne '*')) { $test = qq{ (\$_[0]->gi eq "$gi") && ( \$_[0]->field( "$gi2") $match $regexp) }; } else { $test = qq{\$_[0]->field( "$gi2") $match $regexp}; } } else { croak "wrong condition $original_cond"; } } #warn "\n$original_cond: $test"; my $s= eval "sub { return \$_[0] if( $not($test)) }"; if( $@) { # warn "sub: $sub"; croak "wrong navigation condition $original_cond ($@);" } return $s; } sub op { my $op= shift; if( $op eq '=') { $op= 'eq'; } elsif( $op eq '!=') { $op= 'ne'; } return $op; } sub passes { my( $elt, $cond)= @_; return $elt unless $cond; my $sub= ($cond_cache{$cond} ||= install_cond( $cond)); return $sub->( $elt); } } # end-extract twig_nodes sub my_passes { my( $elt, $cond)= @_; return $elt unless $cond; unless( ref $cond) { # the condition is a string if( $cond eq ELT) { return $elt if $elt->is_elt; } elsif( $cond eq TEXT) { return $elt if $elt->is_text; } else { return $elt if $XML::Twig::index2gi[$elt->{'gi'}] eq $cond; } } elsif( ref $cond eq 'Regexp') { return $elt if $XML::Twig::index2gi[$elt->{'gi'}]=~ $cond; } elsif( ref $cond eq 'CODE') { return $elt if $cond->($elt); } return undef; } sub set_parent { $_[0]->{parent}= $_[1]; weaken( $_[0]->{parent}) if( $XML::Twig::weakrefs); # warn "weakening parent\n" if( $XML::Twig::weakrefs); } #start-extract twig_node sub parent { my $elt= shift; my $cond= shift || return $elt->{parent}; do { $elt= $elt->{parent} || return; } until (!$elt || $elt->passes( $cond)); return $elt; } #end-extract twig_node sub set_first_child { delete $_[0]->{empty} if( $_[0]->is_empty); $_[0]->{'first_child'}= $_[1]; } #start-extract twig_node sub first_child { my $elt= shift; my $cond= shift || return $elt->{first_child}; my $child= $elt->{first_child}; my $test_cond= ($cond_cache{$cond} ||= install_cond( $cond)); while( $child && !$test_cond->( $child)) { $child= $child->{next_sibling}; } return $child; } #end-extract twig_node sub _first_child { return $_[0]->{first_child}; } sub _last_child { return $_[0]->{last_child}; } sub _next_sibling { return $_[0]->{next_sibling}; } sub _prev_sibling { return $_[0]->{prev_sibling}; } sub _parent { return $_[0]->{parent}; } # sets a field # arguments $record, $cond, @content sub set_field { my $record = shift; my $cond = shift; my $child= $record->first_child( $cond); my $new_field= XML::Twig::Elt->new( @_); if( $child) { $new_field->replace( $child); } else { $new_field->paste( last_elt => $record); } return $new_field; } sub set_last_child { delete $_[0]->{empty} if( $_[0]->is_empty); $_[0]->{'last_child'}= $_[1]; weaken( $_[0]->{'last_child'}) if( $XML::Twig::weakrefs); } #start-extract twig_node sub last_child { my $elt= shift; my $cond= shift || return $elt->{last_child}; my $test_cond= ($cond_cache{$cond} ||= install_cond( $cond)); my $child= $elt->{last_child}; while( $child && !$test_cond->( $child) ) { $child= $child->{prev_sibling}; } return $child } #end-extra