# -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- # npxml.pm - A handy XML manipuration package # -- # (C) Nekopps, Perl & PHP scripting service (http://uketama.nekopps.com/). # Changes: 2008/12/10 begin, 2008/12/30 v.1.01, 2010/10/05 v.1.02 # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- package NpXML; require 5.006; our $VERSION = '1.02'; our $DEBUG = 0; use strict; use warnings; my %default = ( # for input data => undef, strict => 1, forcearray => undef, attr_prefix => '@', id_prefix => '#', content_collapse => 1, content_remain => 0, unescaping => 1, # for output tree => undef, file => undef, indent => " ", br => "\n", sortorder => '$a cmp $b', escaping => 1, ); *xml_unserialize = \&NpXML::XMLFuncs::xml_unserialize; *xml_serialize = \&NpXML::XMLFuncs::xml_serialize; # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- # [Object methods] # -- # new # set # get # data # tree # in # out # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub new { my $class = shift; my $self = {param => {%default}}; bless $self, $class; $self->set(@_) or return undef; $self; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub set { my $self = shift; if(@_ % 2) { warn "Parameters must be pairs: name => value"; return undef; } my %params = @_; for(keys %params) { warn "Unkown parameter key $_" unless exists $default{$_}; $self->{param}{$_} = $params{$_}; } $self; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub get { my $self = shift; for(@_){ warn "Unkown parameter key $_" unless exists $default{$_}; } @{$self->{param}}{@_}; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub data { my($self, $data) = @_; if(defined $data) { $self->{param}{data} = $data; return $self; } $self->{param}{data}; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub tree { my($self, $tree) = @_; if(defined $tree) { $self->{param}{tree} = $tree; return $self; } $self->{param}{tree}; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub in { my $self = shift; $self->set(@_) or return undef; my $ret = xml_unserialize($self->{param}{data}, %{$self->{param}}); if($ret) { $self->{param}{tree} = $ret; } else { warn "Failed to unserialize XML rawdata"; } $self; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub out { my $self = shift; $self->set(@_) or return undef; my $ret = xml_serialize($self->{param}{tree}, %{$self->{param}}); if($ret) { $self->{param}{data} = $ret unless $self->{param}{file}; } else { warn "Failed to serialize XML tree"; } $self; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub dump { my($self, $fname) = @_; my $old = undef; if(length $fname) { my $fh; if(open $fh, ">", $fname) { eval{ flock $fh, 2}; binmode $fh; } else { warn "Cannot open file $fname with write-mode: $!"; return undef; } $old = select $fh; } require Data::Dumper; print Data::Dumper::Dumper($self->{param}{tree}); select $old if $old; $self; } package NpXML::XMLFuncs; # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- # [XML manipulations] # -- # $ xml_escape # $ xml_unescape # @ xml_markup_declaration : find and parse XML markups # $|B xml_unserialize : generate a tree from XML # $|B xml_serialize : make the tree one string # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub xml_escape { local $_ = shift; s/\&/&/g; s//>/g; s/\"/"/g; s/\'/&\#39;/g; return $_; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub xml_unescape { local $_ = shift; # -- modified 2010/10/05 # -- purpose: the '&' was carelessly double unescaped # -- into s/<//g; s/"/\"/g; s/&\#(\d+);/chr($1)/eg; s/&/&/g; # -- from # s/&/&/g; # s/<//g; # s/"/\"/g; # s/&\#(\d+);/chr($1)/eg; # -- end return $_; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub xml_markup_declaration { my $data = shift; unless(ref $data eq "SCALAR") { warn "First argument is must be reference to scalar"; return undef; } my %cond = ( tag => undef, # tag name to find, may be code ispi => undef, # whether PI or not isopen => undef, # whether open-tag or not isclose => undef, # whether close-tag or not isalone => undef, # whether alone-tag or not attr => undef, # attribute to find, may be array|hash|code @_ ); if(my $at = $cond{attr}) { unless(ref($at) eq "CODE") { $at = [$at] unless ref $at; $at = {map {$_ => undef} @$at} if ref $at eq "ARRAY"; $cond{attr} = $at; } } my $start = pos($$data)||0; my %mup = ( tag => undef, prop => [], position => undef, ispi => 0, isopen => 0, isclose => 0, isalone => 0, between => undef, # the text in between next and previous ); my $state = undef; while($$data =~ /<([\?\!])?/sg) { my $q = $1 || ""; next if $q eq "!"; # no supported my $s = pos $$data; $mup{position} = $s - ($q ? 2 : 1); my %prop; # for attr matching FindEnd: while($$data =~ /\Q$q\E>/sg) { %prop = (); $mup{prop} = []; $state = substr $$data, $s, pos($$data)-$s-($q ? 2 : 1); while($state =~ /([\w\:]+)\=([\"\'])/sg) # ensure quotings match { my $s2 = pos $state; my($name, $quote) = ($1, $2); { next FindEnd unless $state =~ /\Q$quote\E/sg; redo if substr($state, pos($state)-2,1) eq "\\"; } my $value = substr $state, $s2, pos($state)-$s2-1; $value =~ s/\\\Q$quote\E/$quote/sg; push @{$mup{prop}}, [$name, $value]; $prop{$name} = $value; } last; } continue { undef($state); } # now, we got the statement print "markup: <$q$state$q>\n" if $DEBUG; $state =~ s/^\s+//s; $state =~ s/\s+$//s; $mup{ispi} = $q ? 1 : 0; $mup{isclose} = int($state =~ s|^/||); $mup{isalone} = int($state =~ s|/$||); $mup{isopen} = $mup{ispi}+$mup{isclose}+$mup{isalone} ? 0 : 1; $mup{tag} = (split /\s/, $state, 2)[0]; # test tag name if(defined $cond{tag}) { my $res = 1; my $tag = $cond{tag}; if(ref $tag eq "CODE") { local $_ = $mup{tag}; $res = eval{ $tag->(); }; if($@){ warn "Code evalution error: $tag"; return undef; } next unless $res; } else { $res = $mup{tag} eq $cond{tag}; } next unless $res; } # test tag sort next unless sub{ for(qw/ispi isopen isclose isalone/) {return 0 if defined($cond{$_}) && $mup{$_} != $cond{$_};} 1; }->(); # test attributes if(defined $cond{attr}) { my $res = 1; my $attr = $cond{attr}; if(ref $attr eq "CODE") { local $_ = \%prop; $res = eval{ $attr->(); }; if($@){ warn "Code evalution error: $attr"; return undef; } next unless $res; } else { for my $name(keys %$attr) { $res = 0, last unless exists $prop{$name}; $res = $prop{$name} eq $attr->{$name} if defined $attr->{$name}; last unless $res; } next unless $res; } } # successfuly done $mup{between} = substr $$data, $start, $mup{position}-$start; last; } continue { undef($state); } $state ? \%mup : undef; } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub xml_unserialize { my $data = shift; my %args = ( strict => 1, forcearray => undef, attr_prefix => '-', id_prefix => '#', content_collapse => 1, content_remain => 0, unescaping => 1, @_, ); if(my $fa = $args{forcearray}) { $fa = [$fa] unless ref $fa; $fa = {map {$_ => 1} @$fa} unless ref $fa eq "HASH"; $args{forcearray} = $fa; } if($data !~ /; close $fh; } if($args{strict}) { my $cur = pos $data; my $mup = xml_markup_declaration(\$data); pos($data) = $cur; unless( $mup && $mup->{ispi} && $mup->{tag} =~ /^xml\z/i && $mup->{between} !~ /\S/s ) { warn "Not started from XML declaration "; return undef; } } # let's read the document my $tree = {}; local *read_element = sub { my($elem, $depth) = @_; my $start = pos($data)||0; $depth = 0 unless defined $depth; if($depth > 100){ warn "Over 100 depth at $start"; return undef; } print +('-'x$depth)."element: $elem->{'!tag'}\n" if $DEBUG; my $mup = undef; my %cond = (); # important(2008/12/28): # it does not make a child from contents starting from non-space my $cur = pos $data; $mup = xml_markup_declaration(\$data); # pre-read once pos($data) = $cur; if($mup && $mup->{isopen} && $mup->{between} =~ /\S/s) { # force to find a close-tag for this %cond = (tag => $elem->{"!tag"}, isclose => 1); } my $loop = 0; while($mup = xml_markup_declaration(\$data, %cond)) { if($loop++ > 1000){ warn "Over 1000 loop at ".(pos $data); return undef; } my($tag, $prop, $ispi, $isopen, $isclose, $isalone, $position, $between); eval "\$$_ = \$mup->{$_}" for keys %$mup; # for easy use variables print "between: $between\n" if $DEBUG; if($isclose) { my $tagname = $elem->{"!tag"}; if($tagname && $tagname eq $tag) { my $cont = substr $data, $start, $position-$start; if($args{unescaping}){ $cont = xml_unescape($cont); } $elem->{"!content"} = $cont; } else { warn "Missmatched close-tag '$tag' at $position"; return undef; } return 1; } # make a child my $child = {map {$args{attr_prefix}.$_->[0] => $_->[1]} @$prop}; if($args{unescaping}){ for(values %$child){ $_ = xml_unescape($_); } } $child->{"!attrs"} = [map{$_->[0]} @$prop] if @$prop; if(my $id = $child->{$args{attr_prefix}."id"}){ $tree->{$args{id_prefix}.$id} = $child; } if($ispi){ $child->{"!ispi"} = 1; } # down into the element if necessary if($isopen) { $child->{"!tag"} = $tag; read_element($child, $depth+1) or return undef; delete $child->{"!tag"}; } # content collapse, and remaining if(my @keys = keys %$child) { if(@keys == 1 && $args{content_collapse}) { if(exists $child->{"!content"}) { $child = $child->{"!content"}; } } elsif(@keys > 1 && ! $args{content_remain}) { for(@keys) { my $pre = substr $_, 0, 1; if($pre ne $args{attr_prefix} && $pre ne "!") { delete $child->{"!content"}; last; } } } } # deal with repetition tags if(exists $elem->{$tag}) { $elem->{$tag} = [$elem->{$tag}] unless ref $elem->{$tag} eq "ARRAY"; push @{$elem->{$tag}}, $child; } else { if(exists $args{forcearray}->{$tag}) { $elem->{$tag} = [$child]; } else { $elem->{$tag} = $child; } } # note the order of children for later $elem->{"!childs"} ||= []; unless(grep {$_ eq $tag} @{$elem->{"!childs"}}) { push @{$elem->{"!childs"}}, $tag; } } $elem; }; read_element($tree); } # -- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- sub xml_serialize { my $tree = shift; my %args = ( attr_prefix => '-', id_prefix => '#', file => undef, indent => " ", br => "\n", sortorder => '$a cmp $b', escaping => 1, @_ ); return $tree unless ref $tree; my($file, $indent, $br) = @args{qw/file indent br/}; # check to the sort-routine right { my($a,$b) = qw/abc xyz/; my $res = eval $args{sortorder}; if($@) { warn "Error within the sort-routine: $@"; return undef; } if($res !~ /^\-?\d+$/) { warn "Wrong result '$res' from the sort-routine"; return undef; } } # make a filehandle my($fh, $tname); if($file) { if(open $fh, ">", $file) { eval{ flock $fh, 2 }; seek $fh, 0, 0; binmode $fh; } else { warn "Cannot open the file($file) with write-mode: $!"; return undef; } } else { require File::Temp; ($fh, $tname) = File::Temp::tempfile(); unless($fh) { warn "Cannot make temporary file"; return undef; } } # the subroutine which serialize each elements local *write_element = sub { my($elem, $depth) = @_; return undef if ref $elem ne "HASH"; $depth = 0 unless defined $depth; if($depth > 100){ warn "Over 100 depth at tag ".($elem->{"!tag"}); return undef; } my $attrs = ""; my @attrs = $elem->{"!attrs"} ? @{$elem->{"!attrs"}} : ( sort {length $a <=> length $b} map {substr $_, length($args{attr_prefix})} grep {0 == index $_, $args{attr_prefix}} keys %$elem ); if(@attrs) { $attrs = join " ", '', map { my $val = $elem->{$args{attr_prefix}.$_}; if($args{escaping}){ $val = xml_escape($val); } $val =~ s/\"/\\\"/sg; qq($_="$val"); } @attrs; } my @childs = $elem->{"!childs"} ? @{$elem->{"!childs"}} : ( sort { # to move arrays at the tail my($r1,$r2) = (ref $elem->{$a}, ref $elem->{$b}); $r1 eq $r2 ? 0 : $r1 eq "ARRAY" ? 1 : -1; } grep { my $pre = substr $_, 0, 1; $pre ne $args{attr_prefix} && $pre ne "!"; } sort {eval $args{sortorder}} keys %$elem ); my $tag = $elem->{"!tag"}; my $left = $indent x $depth; if($elem->{"!ispi"}) { print "$left$br"; return 1; } unless(@childs) { my $cont = exists $elem->{"!content"} ? $elem->{"!content"} : ""; if($args{escaping}){ $cont = xml_escape($cont); } print "$left<$tag$attrs>$cont$br"; return 1; } print "$left<$tag$attrs>$br" if $tag; for(@childs) { my $ref = ref $elem->{$_}; unless($ref) { $elem->{$_} = {"!content" => $elem->{$_}}; $ref = ref $elem->{$_}; } my @items = $ref eq "ARRAY" ? @{$elem->{$_}} : $ref eq "HASH" ? $elem->{$_} : (); unless(@items){ warn "Unsupported reference type($ref)"; } for my $item(@items) { $item->{"!tag"} = $_; write_element($item, $depth+1) or return undef; delete $item->{"!tag"}; } } print "$left$br" if $tag; }; # let's print out data my $oldfh = select $fh; write_element($tree, -1); close $fh; select $oldfh; return 1 if $file; if(open $fh, $tname) { eval{ flock $fh, 1 }; local $/ = undef; return <$fh>; } else { warn "Cannot get temporary contents"; return undef; } } 1;