File Coverage

File:blib/lib/XML/Twig.pm
Coverage:90.0%

linestmtbrancondsubpodtimecode
1# $Id: /xmltwig/trunk/Twig_pm.slow 33 2008-04-30T08:03:41.004487Z mrodrigu $
2#
3# Copyright (c) 1999-2004 Michel Rodriguez
4# All rights reserved.
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the same terms as Perl itself.
8#
9
10# This is created in the caller's space
11BEGIN
12
92
6
272
318
{ sub ::PCDATA { '#PCDATA' }
13
5
494
  sub ::CDATA { '#CDATA' }
14}
15
16
92
92
92
32154
489
888
use UNIVERSAL qw(isa);
17
18######################################################################
19package XML::Twig;
20######################################################################
21
22require 5.004;
23
92
92
92
798
215
628
use strict;
24
25
92
92
92
43724
329
799
use utf8; # > perl 5.5
26
27
92
92
92
794
269
810
use vars qw($VERSION @ISA %valid_option);
28
92
92
92
787
249
4719
use Carp;
29
30
92
92
92
818
293
1205
use File::Spec;
31
92
92
92
842
232
1778
use File::Basename;
32
33
92
92
92
811
243
786
use UNIVERSAL qw(isa);
34
35# constants: element types
36
92
92
92
816
231
1144
use constant (PCDATA => '#PCDATA');
37
92
92
92
971
234
749
use constant (CDATA => '#CDATA');
38
92
92
92
791
244
657
use constant (PI => '#PI');
39
92
92
92
785
225
656
use constant (COMMENT => '#COMMENT');
40
92
92
92
779
233
678
use constant (ENT => '#ENT');
41
42# element classes
43
92
92
92
786
218
800
use constant (ELT => '#ELT');
44
92
92
92
801
284
792
use constant (TEXT => '#TEXT');
45
46# element properties
47
92
92
92
810
220
660
use constant (ASIS => '#ASIS');
48
92
92
92
775
291
647
use constant (EMPTY => '#EMPTY');
49
50# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
51
92
92
92
974
222
648
use constant (BUFSIZE => 32768);
52
53
54# used to store the gi's
55my %gi2index; # gi => index
56my @index2gi; # list of gi's
57my $SPECIAL_GI; # first non-special gi;
58my %base_ent; # base entity character => replacement
59
60# flag, set to true if the weaken sub is available
61
92
92
92
803
228
754
use vars qw( $weakrefs);
62
63# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs
64# wrt doctype handling. This is global for performance reasons.
65my $expat_1_95_2=0;
66
67# xml name (leading # allowed)
68# first line is for perl 5.005, second line for modern perl, that accept character classes
69my $REG_NAME = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*)}; # does not work for leading non-ascii letters
70   $REG_NAME = q{(?:(?:[[:alpha:]:#])(?:[\w.-]*:)?[\w.-]*)}; # > perl 5.5
71
72# name or wildcard (* or '') (leading # allowed)
73my $REG_NAME_W = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # does not work for leading non-ascii letters
74   $REG_NAME_W = q{(?:(?:[[:alpha:]:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # > perl 5.5
75
76my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp
77my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp
78my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers
79my $REG_MATCH = q{[!=]~}; # match (or not)
80my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)
81my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number
82my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value
83my $REG_OP = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=}; # op
84my $REG_FUNCTION = q{(?:string|text)\(\s*\)};
85my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
86my $REG_COMP = q{(?:>=|<=|!=|<|>|=)};
87
88
89# used in the handler trigger code
90my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or))*)};
91my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]};
92
93# not all axis, only supported ones (in get_xpath)
94my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
95                      'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
96                    );
97my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")";
98
99# only used in the "xpath"engine (for get_xpath/findnodes) for now
100my $REG_PREDICATE_ALT = qr{\[(?:(?:string\(\s*\)|\@$REG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
101
102# used to convert XPath tests on strings to the perl equivalent
103my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
104
105my $parser_version;
106my( $FB_HTMLCREF, $FB_XMLCREF);
107
108BEGIN
109{
110
92
630
$VERSION = '3.33';
111
112
92
92
92
1109
312
1473
use XML::Parser;
113
92
414
my $needVersion = '2.23';
114
92
355
$parser_version= $XML::Parser::VERSION;
115
92
965
croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
116
117
92
689
if( $] >= 5.008)
118
92
92
92
92
270
1231
472
1400
  { eval "use Encode qw( :all)";
119
92
3887
    $FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF;
120
92
407
    $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
121  }
122
123# test whether we can use weak references
124# set local empty signal handler to trap error messages
125
92
92
282
765
{ local $SIG{__DIE__};
126
92
3016
  if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken))
127
92
92
1263
1076
    { import Scalar::Util( 'weaken'); $weakrefs= 1; }
128  elsif( eval( 'require WeakRef'))
129
0
0
0
0
    { import WeakRef; $weakrefs= 1; }
130  else
131
0
0
    { $weakrefs= 0; }
132}
133
134
92
1089
import XML::Twig::Elt;
135
92
939
import XML::Twig::Entity;
136
92
934
import XML::Twig::Entity_list;
137
138# used to store the gi's
139# should be set for each twig really, at least when there are several
140# the init ensures that special gi's are always the same
141
142# gi => index
143# do NOT use => or the constants become quoted!
144
92
1245
%XML::Twig::gi2index=( PCDATA, 0, CDATA, 1, PI, 2, COMMENT, 3, ENT, 4);
145# list of gi's
146
92
810
@XML::Twig::index2gi=( PCDATA, CDATA, PI, COMMENT, ENT);
147
148# gi's under this value are special
149
92
379
$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
150
151
92
1068
%XML::Twig::base_ent= ( '>' => '&gt;', '<' => '&lt;', '&' => '&amp;', "'" => '&apos;', '"' => '&quot;',);
152
153# now set some aliases
154
92
511
*find_nodes = *get_xpath; # same as XML::XPath
155
92
449
*findnodes = *get_xpath; # same as XML::LibXML
156
92
420
*getElementsByTagName = *descendants;
157
92
415
*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt
158
92
407
*find_by_tag_name = *descendants;
159
92
405
*getElementById = *elt_id;
160
92
416
*getEltById = *elt_id;
161
92
524
*toString = *sprint;
162}
163
164@ISA = qw(XML::Parser);
165
166# fake gi's used in twig_handlers and start_tag_handlers
167my $ALL = '_all_'; # the associated function is always called
168my $DEFAULT= '_default_'; # the function is called if no other handler has been
169
170# some defaults
171my $COMMENTS_DEFAULT= 'keep';
172my $PI_DEFAULT = 'keep';
173
174
175# handlers used in regular mode
176my %twig_handlers=( Start => \&_twig_start,
177                    End => \&_twig_end,
178                    Char => \&_twig_char,
179                    Entity => \&_twig_entity,
180                    XMLDecl => \&_twig_xmldecl,
181                    Doctype => \&_twig_doctype,
182                    Element => \&_twig_element,
183                    Attlist => \&_twig_attlist,
184                    CdataStart => \&_twig_cdatastart,
185                    CdataEnd => \&_twig_cdataend,
186                    Proc => \&_twig_pi,
187                    Comment => \&_twig_comment,
188                    Default => \&_twig_default,
189                    ExternEnt => \&_twig_extern_ent,
190      );
191
192# handlers used when twig_roots is used and we are outside of the roots
193my %twig_handlers_roots=
194  ( Start => \&_twig_start_check_roots,
195    End => \&_twig_end_check_roots,
196    Doctype => \&_twig_doctype,
197    Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl,
198    Element => undef, Attlist => undef, CdataStart => undef,
199    CdataEnd => undef, Proc => undef, Comment => undef,
200    Proc => \&_twig_pi_check_roots,
201    Default => sub {}, # hack needed for XML::Parser 2.27
202    ExternEnt => \&_twig_extern_ent,
203  );
204
205# handlers used when twig_roots and print_outside_roots are used and we are
206# outside of the roots
207my %twig_handlers_roots_print_2_30=
208  ( Start => \&_twig_start_check_roots,
209    End => \&_twig_end_check_roots,
210    Char => \&_twig_print,
211    Entity => \&_twig_print_entity,
212    ExternEnt => \&_twig_print_entity,
213    DoctypeFin => \&_twig_doctype_fin_print,
214    XMLDecl => \&_twig_print,
215    Doctype => \&_twig_print_doctype, # because recognized_string is broken here
216    # Element => \&_twig_print, Attlist => \&_twig_print,
217    CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
218    Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
219    Default => \&_twig_print_check_doctype,
220    ExternEnt => \&_twig_extern_ent,
221  );
222
223# handlers used when twig_roots, print_outside_roots and keep_encoding are used
224# and we are outside of the roots
225my %twig_handlers_roots_print_original_2_30=
226  ( Start => \&_twig_start_check_roots,
227    End => \&_twig_end_check_roots,
228    Char => \&_twig_print_original,
229    # I have no idea why I should not be using this handler!
230    Entity => \&_twig_print_entity,
231    ExternEnt => \&_twig_print_entity,
232    DoctypeFin => \&_twig_doctype_fin_print,
233    XMLDecl => \&_twig_print_original,
234    Doctype => \&_twig_print_original_doctype, # because original_string is broken here
235    Element => \&_twig_print_original, Attlist => \&_twig_print_original,
236    CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
237    Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
238    Default => \&_twig_print_original_check_doctype,
239  );
240
241# handlers used when twig_roots and print_outside_roots are used and we are
242# outside of the roots
243my %twig_handlers_roots_print_2_27=
244  ( Start => \&_twig_start_check_roots,
245    End => \&_twig_end_check_roots,
246    Char => \&_twig_print,
247    # if the Entity handler is set then it prints the entity declaration
248    # before the entire internal subset (including the declaration!) is output
249    Entity => sub {},
250    XMLDecl => \&_twig_print, Doctype => \&_twig_print,
251    CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
252    Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
253    Default => \&_twig_print,
254    ExternEnt => \&_twig_extern_ent,
255  );
256
257# handlers used when twig_roots, print_outside_roots and keep_encoding are used
258# and we are outside of the roots
259my %twig_handlers_roots_print_original_2_27=
260  ( Start => \&_twig_start_check_roots,
261    End => \&_twig_end_check_roots,
262    Char => \&_twig_print_original,
263    # for some reason original_string is wrong here
264    # this can be a problem if the doctype includes non ascii characters
265    XMLDecl => \&_twig_print, Doctype => \&_twig_print,
266    # if the Entity handler is set then it prints the entity declaration
267    # before the entire internal subset (including the declaration!) is output
268    Entity => sub {},
269    #Element => undef, Attlist => undef,
270    CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
271    Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
272    Default => \&_twig_print, # _twig_print_original does not work
273    ExternEnt => \&_twig_extern_ent,
274  );
275
276
277my %twig_handlers_roots_print= $parser_version > 2.27
278                               ? %twig_handlers_roots_print_2_30
279                               : %twig_handlers_roots_print_2_27;
280my %twig_handlers_roots_print_original= $parser_version > 2.27
281                               ? %twig_handlers_roots_print_original_2_30
282                               : %twig_handlers_roots_print_original_2_27;
283
284
285# handlers used when the finish_print method has been called
286my %twig_handlers_finish_print=
287  ( Start => \&_twig_print,
288    End => \&_twig_print, Char => \&_twig_print,
289    Entity => \&_twig_print, XMLDecl => \&_twig_print,
290    Doctype => \&_twig_print, Element => \&_twig_print,
291    Attlist => \&_twig_print, CdataStart => \&_twig_print,
292    CdataEnd => \&_twig_print, Proc => \&_twig_print,
293    Comment => \&_twig_print, Default => \&_twig_print,
294    ExternEnt => \&_twig_extern_ent,
295  );
296
297# handlers used when the finish_print method has been called and the keep_encoding
298# option is used
299my %twig_handlers_finish_print_original=
300  ( Start => \&_twig_print_original, End => \&_twig_print_end_original,
301    Char => \&_twig_print_original, Entity => \&_twig_print_original,
302    XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original,
303    Element => \&_twig_print_original, Attlist => \&_twig_print_original,
304    CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
305    Proc => \&_twig_print_original, Comment => \&_twig_print_original,
306    Default => \&_twig_print_original,
307  );
308
309# handlers used within ignored elements
310my %twig_handlers_ignore=
311  ( Start => \&_twig_ignore_start,
312    End => \&_twig_ignore_end,
313    Char => undef, Entity => undef, XMLDecl => undef,
314    Doctype => undef, Element => undef, Attlist => undef,
315    CdataStart => undef, CdataEnd => undef, Proc => undef,
316    Comment => undef, Default => undef,
317    ExternEnt => undef,
318  );
319
320
321# those handlers are only used if the entities are NOT to be expanded
322my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default );
323
324my @saved_default_handler;
325
326my $ID= 'id'; # default value, set by the Id argument
327
328# all allowed options
329%valid_option=
330    ( # XML::Twig options
331      TwigHandlers => 1, Id => 1,
332      TwigRoots => 1, TwigPrintOutsideRoots => 1,
333      StartTagHandlers => 1, EndTagHandlers => 1,
334      ForceEndTagHandlersUsage => 1,
335      DoNotChainHandlers => 1,
336      IgnoreElts => 1,
337      Index => 1,
338      CharHandler => 1,
339      TopDownHandlers => 1,
340      KeepEncoding => 1, DoNotEscapeAmpInAtts => 1,
341      ParseStartTag => 1, KeepAttsOrder => 1,
342      LoadDTD => 1, DTDHandler => 1,
343      DoNotOutputDTD => 1, NoProlog => 1,
344      ExpandExternalEnts => 1,
345      DiscardSpaces => 1, KeepSpaces => 1,
346      DiscardSpacesIn => 1, KeepSpacesIn => 1,
347      PrettyPrint => 1, EmptyTags => 1,
348      Quote => 'double',
349      Comments => 1, Pi => 1,
350      OutputFilter => 1, InputFilter => 1,
351      OutputTextFilter => 1,
352      OutputEncoding => 1,
353      RemoveCdata => 1,
354      EltClass => 1,
355      MapXmlns => 1, KeepOriginalPrefix => 1,
356      SkipMissingEnts => 1,
357      # XML::Parser options
358      ErrorContext => 1, ProtocolEncoding => 1,
359      Namespaces => 1, NoExpand => 1,
360      Stream_Delimiter => 1, ParseParamEnt => 1,
361      NoLWP => 1, Non_Expat_Options => 1,
362      Xmlns => 1,
363    );
364
365# predefined input and output filters
366
92
92
92
876
255
1061
use vars qw( %filter);
367%filter= ( html => \&html_encode,
368           safe => \&safe_encode,
369           safe_hex => \&safe_encode_hex,
370         );
371
372
373# trigger types (used to sort them)
374my ($XPATH_TRIGGER, $REGEXP_TRIGGER, $LEVEL_TRIGGER)=(1..3);
375
376sub new
377
2771
517522
  { my ($class, %args) = @_;
378
2771
7247
    my $handlers;
379
380    # change all nice_perlish_names into nicePerlishNames
381
2771
17537
    %args= _normalize_args( %args);
382
383    # check options
384
2771
22380
    unless( $args{MoreOptions})
385
2770
17437
      { foreach my $arg (keys %args)
386
5054
39551
        { carp "invalid option $arg" unless $valid_option{$arg}; }
387      }
388
389    # a twig is really an XML::Parser
390    # my $self= XML::Parser->new(%args);
391
2771
9224
    my $self;
392
2771
25744
    $self= XML::Parser->new(%args);
393
394
2771
478812
    bless $self, $class;
395
396
2771
20485
    $self->{_twig_context_stack}= [];
397
398
2771
18534
    if( exists $args{TwigHandlers})
399
162
815
      { $handlers= $args{TwigHandlers};
400
162
1083
        $self->setTwigHandlers( $handlers);
401
156
844
        delete $args{TwigHandlers};
402      }
403
404    # take care of twig-specific arguments
405
2765
17289
    if( exists $args{StartTagHandlers})
406
23
189
      { $self->setStartTagHandlers( $args{StartTagHandlers});
407
23
131
        delete $args{StartTagHandlers};
408      }
409
410
2765
15103
    if( exists $args{DoNotChainHandlers})
411
1
9
      { $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; }
412
413
2765
15387
    if( exists $args{IgnoreElts})
414      { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)]
415
4
0
0
0
42
0
0
0
        if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; }
416
4
32
        $self->setIgnoreEltsHandlers( $args{IgnoreElts});
417
4
25
        delete $args{IgnoreElts};
418      }
419
420
2765
13958
    if( exists $args{Index})
421
2
9
      { my $index= $args{Index};
422        # we really want a hash name => path, we turn an array into a hash if necessary
423
2
13
        if( ref( $index) eq 'ARRAY')
424
1
2
5
16
          { my %index= map { $_ => $_ } @$index;
425
1
6
            $index= \%index;
426          }
427
2
18
        while( my( $name, $exp)= each %$index)
428
3
4
4
4
49
11
39
32
          { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
429      }
430
431
2765
40228
    $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
432
2765
62
15365
321
    if( exists( $args{EltClass})) { delete $args{EltClass}; }
433
434
2765
14037
    if( exists( $args{MapXmlns}))
435
15
104
      { $self->{twig_map_xmlns}= $args{MapXmlns};
436
15
79
        $self->{Namespaces}=1;
437
15
74
        delete $args{MapXmlns};
438      }
439
440
2765
14764
    if( exists( $args{KeepOriginalPrefix}))
441
4
29
      { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
442
4
18
        delete $args{KeepOriginalPrefix};
443      }
444
445
2765
17772
    $self->{twig_dtd_handler}= $args{DTDHandler};
446
2765
10533
    delete $args{DTDHandler};
447
448
2765
16381
    if( $args{ExpandExternalEnts})
449
4
27
      { $self->set_expand_external_entities( 1);
450
4
29
        $self->{twig_expand_external_ents}= $args{ExpandExternalEnts};
451
4
22
        $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
452
4
29
        if( $args{ExpandExternalEnts} == -1)
453
1
6
          { $self->{twig_extern_ent_nofail}= 1;
454
1
27
            $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail);
455          }
456
4
90
        delete $args{LoadDTD};
457
4
18
        delete $args{ExpandExternalEnts};
458      }
459    else
460
2761
16474
      { $self->set_expand_external_entities( 0); }
461
462
2765
30142
    if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP'))
463
0
0
      { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler }
464    else
465
2765
21418
      { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler }
466
467
2765
17894
    if( $args{DoNotEscapeAmpInAtts})
468
1
5
      { $self->set_do_not_escape_amp_in_atts( 1);
469
1
6
        $self->{twig_do_not_escape_amp_in_atts}=1;
470      }
471    else
472
2764
16412
      { $self->set_do_not_escape_amp_in_atts( 0);
473
2764
16607
        $self->{twig_do_not_escape_amp_in_atts}=0;
474      }
475
476    # deal with TwigRoots argument, a hash of elements for which
477    # subtrees will be built (and associated handlers)
478
479
2765
16086
    if( $args{TwigRoots})
480
63
477
      { $self->setTwigRoots( $args{TwigRoots});
481
61
900
        delete $args{TwigRoots};
482      }
483
484
2763
14594
    if( $args{EndTagHandlers})
485
11
125
      { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
486
1
11
          { croak "you should not use EndTagHandlers without TwigRoots\n",
487                  "if you want to use it anyway, normally because you have ",
488                  "a start_tag_handlers that calls 'ignore' and you want to ",
489                  "call an ent_tag_handlers at the end of the element, then ",
490                  "pass 'force_end_tag_handlers_usage => 1' as an argument ",
491                  "to new";
492          }
493
494
10
79
        $self->setEndTagHandlers( $args{EndTagHandlers});
495
10
60
        delete $args{EndTagHandlers};
496      }
497
498
2762
14558
    if( $args{TwigPrintOutsideRoots})
499
34
229
      { croak "cannot use TwigPrintOutsideRoots without TwigRoots"
500          unless( $self->{twig_roots});
501        # if the arg is a filehandle then store it
502
33
199
        if( _is_fh( $args{TwigPrintOutsideRoots}) )
503
31
237
          { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
504
33
286
        $self->{twig_default_print}= $args{TwigPrintOutsideRoots};
505      }
506
507    # space policy
508
2761
14503
    if( $args{KeepSpaces})
509
18
112
      { croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces});
510
17
100
        croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
511
16
76
        $self->{twig_keep_spaces}=1;
512
16
81
        delete $args{KeepSpaces};
513      }
514
2759
14635
    if( $args{DiscardSpaces})
515
2
16
      { croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
516
1
6
        $self->{twig_discard_spaces}=1;
517
1
4
        delete $args{DiscardSpaces};
518      }
519
2758
14210
    if( $args{KeepSpacesIn})
520
8
53
      { croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn});
521
7
37
        $self->{twig_discard_spaces}=1;
522
7
45
        $self->{twig_keep_spaces_in}={};
523
7
7
24
51
        my @tags= @{$args{KeepSpacesIn}};
524
7
9
34
82
        foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
525
7
39
        delete $args{KeepSpacesIn};
526      }
527
2757
14746
    if( $args{DiscardSpacesIn})
528
4
18
      { $self->{twig_keep_spaces}=1;
529
4
25
        $self->{twig_discard_spaces_in}={};
530
4
4
12
29
        my @tags= @{$args{DiscardSpacesIn}};
531
4
6
21
52
        foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
532
4
21
        delete $args{DiscardSpacesIn};
533      }
534    # discard spaces by default
535
2757
24463
    $self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces});
536
537
2757
18563
    $args{Comments}||= $COMMENTS_DEFAULT;
538
2757
3
30632
18
    if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; }
539
1868
10598
    elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; }
540
885
4872
    elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
541
1
10
    else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
542
2756
12001
    delete $args{Comments};
543
544
2756
17048
    $args{Pi}||= $PI_DEFAULT;
545
2756
2
28977
12
    if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; }
546
1870
10552
    elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; }
547
883
4841
    elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
548
1
10
    else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
549
2755
11430
    delete $args{Pi};
550
551
2755
15011
    if( $args{KeepEncoding})
552      {
553        # set it in XML::Twig::Elt so print functions know what to do
554
968
5381
        $self->set_keep_encoding( 1);
555
968
14196
        $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
556
968
5942
        delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
557
968
4553
        delete $args{KeepEncoding};
558      }
559    else
560
1787
10371
      { $self->set_keep_encoding( 0);
561
1787
11938
        $self->{parse_start_tag}= $args{ParseStartTag} if( $args{ParseStartTag});
562      }
563
564
2755
14617
    if( $args{OutputFilter})
565
5
35
      { $self->set_output_filter( $args{OutputFilter});
566
5
24
        delete $args{OutputFilter};
567      }
568    else
569
2750
15096
      { $self->set_output_filter( 0); }
570
571
2755
16335
    if( $args{RemoveCdata})
572
1
9
      { $self->set_remove_cdata( $args{RemoveCdata});
573
1
5
        delete $args{RemoveCdata};
574      }
575    else
576
2754
14858
      { $self->set_remove_cdata( 0); }
577
578
2755
15151
    if( $args{OutputTextFilter})
579
5
33
      { $self->set_output_text_filter( $args{OutputTextFilter});
580
5
25
        delete $args{OutputTextFilter};
581      }
582    else
583
2750
13959
      { $self->set_output_text_filter( 0); }
584
585
586
2755
16807
    if( exists $args{KeepAttsOrder})
587
7
50
      { $self->{keep_atts_order}= $args{KeepAttsOrder};
588
7
37
        if( _use( 'Tie::IxHash'))
589
6
52
          { $self->set_keep_atts_order( $self->{keep_atts_order}); }
590        else
591
1
5
          { croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
592      }
593    else
594
2748
15527
      { $self->set_keep_atts_order( 0); }
595
596
597
2754
42
15586
320
    if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); }
598
2754
1
13870
7
    if( $args{Quote}) { $self->set_quote( $args{Quote}); }
599
2754
12
14174
81
    if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) }
600
601
2754
1
1
15558
6
5
    if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; }
602
2754
3
3
14759
18
15
    if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; }
603
2754
2
2
13780
13
10
    if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; }
604
2754
4
4
14723
27
20
    if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; }
605
2754
1
1
14620
8
5
    if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
606
607
2754
3
1
14015
28
5
    if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; }
608
2752
1
1
14460
9
120
    if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
609
2752
1
1
15935
7
6
    if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
610
611
2752
4
4
15705
21
17
    if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
612
613    # set handlers
614
2752
16333
    if( $self->{twig_roots})
615
61
355
      { if( $self->{twig_default_print})
616
33
203
          { if( $self->{twig_keep_encoding})
617
7
130
              { $self->setHandlers( %twig_handlers_roots_print_original); }
618            else
619
26
439
              { $self->setHandlers( %twig_handlers_roots_print); }
620          }
621        else
622
28
451
          { $self->setHandlers( %twig_handlers_roots); }
623      }
624    else
625
2691
41575
      { $self->setHandlers( %twig_handlers); }
626
627    # XML::Parser::Expat does not like these handler to be set. So in order to
628    # use the various sets of handlers on XML::Parser or XML::Parser::Expat
629    # objects when needed, these ones have to be set only once, here, at
630    # XML::Parser level
631
2752
1602325
    $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
632
633
2752
306704
    $self->{twig_entity_list}= XML::Twig::Entity_list->new;
634
635
2752
14926
    $self->{twig_id}= $ID;
636
2752
15295
    $self->{twig_stored_spaces}='';
637
638
2752
14941
    $self->{twig_autoflush}= 1; # auto flush by default
639
640
2752
13459
    $self->{twig}= $self;
641
2752
29543
    weaken( $self->{twig}) if( $weakrefs);
642
643
2752
20113
    return $self;
644  }
645
646sub parse
647  {
648
2810
73710
    my $t= shift;
649    # if called as a class method, calls nparse, which creates the twig then parses it
650
2810
35
50815
292
    if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); }
651
652    # requires 5.006 at least (or the ${^UNICODE} causes a problem) # > perl 5.5
653    # trap underlying bug in IO::Handle (see RT #17500) # > perl 5.5
654    # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > perl 5.5
655
2775
51117
    if( $]>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] ) # > perl 5.5
656
0
0
      { croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > perl 5.5
657              . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > perl 5.5
658              . "not to include 'D'"; # > perl 5.5
659      } # > perl 5.5
660
2775
2775
9388
36513
    $t= eval { $t->SUPER::parse( @_); };
661
2775
123951
    return _checked_parse_result( $t, $@);
662  }
663
664sub parsefile
665
27
122
  { my $t= shift;
666
27
27
81
457
    $t= eval { $t->SUPER::parsefile( @_); };
667
27
391
    return _checked_parse_result( $t, $@);
668  }
669
670sub _checked_parse_result
671
2802
15453
  { my( $t, $returned)= @_;
672
2802
14289
    if( !$t)
673
22
302
      { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now})
674
6
19
          { $t= $returned;
675
6
28
            delete $t->{twig_finish_now};
676
6
35
            return $t->_twig_final;
677          }
678        else
679
16
100
          { _croak( $returned, 0); }
680      }
681
2780
22229
    return $t;
682