1: #!/usr/local/bin/perl
2: # Copyright (c) 1998 Robert Braddock. All rights reserved.
3: # This program is free software; you can redistribute it and/or
4: # modify it under the same terms as Perl itself.
6: package SGML::ElementMap::Driver::XMLParser;
7: use strict;
9: use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION $REVISION); BEGIN {
10: @ISA = qw( SGML::ElementMap::Driver::EventQueue );
11: @EXPORT = ();
12: @EXPORT_OK = ();
13: %EXPORT_TAGS = ();
14: $VERSION = 0.2;
15: $REVISION = q$Revision: 1.2 $;
16: }
17: use SGML::ElementMap::Driver::EventQueue ();
18: use Exporter;
21: BEGIN {
22: SGML::ElementMap::Driver::set_obj_constants('IDX',&SGML::ElementMap::Driver::EventQueue::IDX_LAST(), qw(nb_interface source offset));
23: sub IDX_parser () { return &SGML::ElementMap::Driver::EventQueue::IDX_parser(); }
24: sub IDX_parse_queue () { return &SGML::ElementMap::Driver::EventQueue::IDX_parse_queue(); }
25: sub IDX_input_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_input_mode(); }
26: sub IDX_markup_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_markup_mode(); }
27: }
29: use Carp;
30: use Symbol;
31: use lib ('/usr/lib/perl5/XML/Parser');
32: use XML::Parser;
33: use XML::Parser::Expat;
35: my $debug = 0;
36: my $trace = 0;
37: my $trace_hnd = 0;
39: my $buf_size = 32;
41: eval "use Data::Dumper;
42: $Data::Dumper::Indent = $Data::Dumper::Indent = 0;
43: $Data::Dumper::Terse = $Data::Dumper::Terse = 1;" if $debug;
45: if (!caller()) { die "$0: perl module, not program\n"; }
47: # create empty object
48: # no arguments means default parser setup
49: sub new {
50: my $proto = shift;
51: my $class = ref($proto) || $proto;
53: carp __PACKAGE__.' does not know what to do with arguments' if @_;
54: my $self = $class->SUPER::new();
55: $self->[&IDX_parser()] = '';
56: $self->[&IDX_nb_interface()] = '';
57: $self->[&IDX_source()] = '';
58: $self->[&IDX_offset()] = '';
59:
60: warn '--NEW from '.join(', line ',(caller())[1,2])."\n" if $trace;
61:
62: $self = bless $self,$class;
63: return $self;
64: }
66: sub DESTROY {
67: my $p = $_[0]->[&IDX_parser()];
68: $_[0]->[&IDX_parser()] = '';
69: warn '--DESTROY from '.join(', line ',(caller())[1,2])."\n" if $trace;
70: if (defined($p) && ref($p) && $p->isa('XML::Parser::Expat')) {
71: $p->release;
72: }
73: }
75: sub parser {
76: my ($self,$parser) = @_;
77:
78: warn '--PARSER from '.join(', line ',(caller())[1,2])."\n" if $trace;
79: my @handlers = ( 'Start' => $self->can( 'handle_start' ),
80: 'End' => $self->can( 'handle_end' ),
81: 'Char' => $self->can( 'handle_char' ),
82: 'Proc' => $self->can( 'handle_proc' ),
83: 'Comment' => $self->can( 'handle_comment' ),
84: 'CdataStart' => $self->can( 'handle_cdata_start' ),
85: 'CdataEnd' => $self->can( 'handle_cdata_end' ),
86: 'Default' => $self->can( 'handle_default' ),
87: );
88: #warn "HANDLERS".join('][',@handlers)."\n";
89: my @args = (
90: #'ProtocolEncoding' => 'UTF-8',
91: 'Namespaces' => 1,
92: 'ParseParamEnt' => 1,
93: );
94:
95: if (defined($parser) && ref($parser)) {
96: %{ $parser } = @args if @args;
97: $parser->setHandlers(@handlers);
98: #->{'Handlers'} = { 'Start' => \&handle_start,
99: #'End' => \&handle_end,
100: #'not finished -- intentional error', };
101: } elsif (defined($parser) && $parser) {
102: if ($parser eq 'XML::Parser::ExpatNB') {
103: $parser = $parser->new( #'Handlers'=>{@handlers},
104: @args);
105: } else {
106: my ($s,$parser) = eval qq{ use $parser ();
107: (1, $parser\->new(\@args)); };
108: warn $@ unless defined $s;
109: }
110: return undef unless $parser || '';
111: $parser->setHandlers(@handlers);
112: }
113: return $self->SUPER::parser($parser);
114: }
116: #start new traversal (for use by users)
117: #passed main module as first (normal) argument, rest are process specific
118: sub process {
119: my ($self,$main,$source,@list) = @_;
120:
121: warn '--PROCESS from '.join(', line ',(caller())[1,2])."\n" if $trace;
122: if ($self->[&IDX_markup_mode()] eq 'sgml') {
123: croak __PACKAGE__." can only parse XML documents";
124: } elsif ($self->[&IDX_markup_mode()] eq 'xml') {
125: if (! $self->[&IDX_parser()]) { # make default parser object
126: croak "failed to instantiate parser" unless
127: defined $self->parser('XML::Parser::ExpatNB');
128: }
129: } else {
130: croak "unknown markup mode '".$self->[&IDX_markup_mode()]."'\n";
131: }
133: my $input_type = $self->[&IDX_input_mode()];
134: if (!$input_type || $input_type eq 'default') { # infer type
135: if (ref($source) eq 'SCALAR') {
136: $input_type = 'literal';
137: } elsif (ref $source) {
138: $input_type = 'handle';
139: } else {
140: $input_type = 'file';
141: }
142: }
144: if ($input_type eq 'file' || $input_type eq 'path') {
145: my $fh = new IO::File($source,'r');
146: croak "failed to open $source: $!\n" unless defined $fh;
147: $source = $fh;
148: $input_type = 'handle';
149: } # (not else)
150: if ($input_type eq 'handle') {
151: $self->[&IDX_source()] = $source;
152: $self->[&IDX_offset()] = '';
153: } elsif ($input_type eq 'literal' || $input_type eq 'string') {
154: $self->[&IDX_offset()] = 0;
155: $self->[&IDX_source()] = ref($source) ? $source : \$source;
156: } else {
157: die "unknown input type '".$input_type."'";
158: }
160: # ready the parser
161: my $parser = $self->[&IDX_parser()];
162: return undef unless
163: $self->[&IDX_nb_interface()] = $parser; #->parse_start();
165: $parser->{__PACKAGE__} = $self; # need access from handlers
167: @list = $self->SUPER::process($main,'',@list);
169: # break circular ref
170: $parser->{__PACKAGE__} = '';# = $self;
172: warn __PACKAGE__."return from process" if $debug;
173: return @list;
174: }
176: # override superclass
177: sub process_sgml_file {
178: my ($self,$main,$file,@list) = @_;
179: croak "XML::Parser::Expat driver only handles XML documents";
180: }
182: sub markup {
183: my ($self,$mode) = @_;
184: if (defined($mode) && lc($mode) eq 'sgml') {
185: return undef;
186: #croak __PACKAGE__." can only parse XML documents";
187: } else {
188: return $self->SUPER::markup($mode);
189: }
190: }
192: sub trace {
193: my ($arg) = @_;
194: if (defined $arg) {
195: $debug = $arg;
196: } else {
197: $debug = 1;
198: }
199: }
202: # parse queue needs more stuff
203: sub queue_more_events {
204: my ($self,$queue) = @_;
205: my $buf = '';
206: my ($offset) = \@{ $self }[&IDX_offset()];
207: my ($parser, $source) = @{ $self }[&IDX_parser(), &IDX_source()];
208: while (@$queue == 0) {
209: croak "parse attempt after parsing already finished" unless $parser;
210: # TODO
211: # can use $parser->skip_until(INDEX) when doing an 'until close' skip
212:
213: SCAN: {
214: if ($$offset ne '') { # source is a string ref
215: last SCAN unless length($$source) > $$offset;
216: $buf = substr($$source, $$offset, $buf_size);
217: $$offset += $buf_size;
218: } else { # source is a IO::Handle
219: die "lost source handle: $!" unless $source;
220: last SCAN if $source->eof;
221: my $s = $source->read($buf,$buf_size);
222: die "TODO: some error occurred (maybe $!)" unless $s;
223: }
224: }
225: if ($buf) {
226: warn "PARSE[$$offset][$source][".$buf."]\n" if $trace_hnd;
227: $parser->parse_more($buf);
228: #ERROR CODE?
229:
230: } else {
231: #warn 'CALL PARSE DONE' if $trace;
232: #push @{ $self->{'parse queue'} }, ('end','','');
233: push @{ $self->[&IDX_parse_queue()] }, ('','conforming','');
234: $parser->parse_done(); #ERROR CODE?
235: $self->[&IDX_parser()] = ''; # Expat kills itself here, so lose reference
236: return 0;
237: }
238: }
239: return 1;
240: }
243: # object, name, [name,val, ...]
244: sub handle_start {
245: my ($parser,$element,@attrs) = @_;
246: my $spec_count = ($parser->specified_attr || 0) / 2; # total count
247: warn "START:$element:[[$spec_count]]:".join(':',@attrs)."\n" if $trace_hnd;
248: my $self = $parser->{__PACKAGE__};
249: warn '--HANDLE_START from '.join(', line ',(caller())[1,2])."\n" if $trace;
251: my @names = ();
252: for(my $i=0; $i<@attrs; $i+=2) {
253: #last unless $i < 2*$spec_count;
254: push @names, $attrs[$i]; # attr name
255: }
256: my $obj = {'Type' => 'Element',
257: 'Name' => $element,
258: 'Attributes' => { @attrs },
259: 'AttributeOrder' => [ @names ],
260: 'Defaulted' => $spec_count };
262: push @{$self->[&IDX_parse_queue()]}, ('start_element',
263: $obj,
264: '',);
265: }
267: # obj, name
268: sub handle_end {
269: my ($parser, $element) = @_;
270: warn "END:$element\n" if $trace_hnd;
271: my $self = $parser->{__PACKAGE__};
272: push @{$self->[&IDX_parse_queue()]},('end_element',
273: scalar($self->nsname($element)),
274: '');
275: #$parser->original_string()];
276: }
278: #obj, utf8-string
279: sub handle_char {
280: my ($parser,$text) = @_;
281: warn "CHAR:$text\n" if $trace_hnd;
282: my $self = $parser->{__PACKAGE__};
283: #$text=$self->encode($text);
284: my $queue = $self->[&IDX_parse_queue()];
285: #join with previous event if it's text too
286: #if (@$queue > 0 && $queue->[$#$queue-2] eq 'cdata') {
287: # $queue->[$#$queue-1] .= $text;
288: # #$self->{'parse queue'}[-1][-1].=$parser->original_string();
289: #} else {
290: push @$queue,('cdata',{'Data'=>($text || '')},'');
291: #}
292: }
295: sub handle_proc {
296: my ($parser,$target,$value) = @_;
297: my $self = $parser->{__PACKAGE__};
298: push @{$self->[&IDX_parse_queue()]},('pi',
299: { 'Target' => ($target || ''),
300: 'Data' => ($value||'') },
301: '');
302: }
304: sub handle_comment {
305: my ($parser,$text) = @_;
306: my $self=$parser->{__PACKAGE__};
307: push @{$self->[&IDX_parse_queue()]},('comment', {'Data'=>($text||'')}, '');
308: }
310: sub handle_cdata_start {
311: my ($parser) = @_;
312: my $self=$parser->{__PACKAGE__};
313: warn "ignoring cdata end\n" if $trace_hnd;
314: #push @{$self->{output}},
315: }
317: sub handle_cdata_end {
318: my ($parser) = @_;
319: my $self = $parser->{__PACKAGE__};
320: warn "ignoring cdata end\n" if $trace_hnd;
321: #push @{$self->{output}},
322: }
324: # any chars that aren't fed to another handler (including markup chars)
325: sub handle_default {
326: my ($parser,$string) = @_;
327: my $self = $parser->{__PACKAGE__};
328: warn "ignoring [".$string."]\n" if $trace_hnd;
329: #push @{$self->{output}},
330: }
335: sub nsname {
336: my ($self,$name) = @_;
337: my $parser = $self->[&IDX_parser()];
338: my $ns_uri = '';
339: if ($parser->{'Namespaces'}) {
340: $ns_uri = $parser->namespace($name) || '';
341: }
342: $name = $self->encode($name);
343: return wantarray ? ($name, $ns_uri) : $name;
344: }
346: # all chars in UTF8 -- so?
347: sub encode {
348: my ($self,$text)=@_;
349: return $text;
350: #if ($self->{latin}) {
351: # $text=~s{([\xc0-\xc3])(.)}{
352: # my $hi = ord($1);
353: # my $lo = ord($2);
354: # chr((($hi & 0x03) <<6) | ($lo & 0x3F))
355: # }ge;
356: #}
357: #$text;
358: }
360: 1;
362: __END__
364: CdataStart (Parser)
365: This is called at the start of a CDATA section.
367: CdataEnd (Parser)
368: This is called at the end of a CDATA section.
370: Default (Parser, String)
371: This is called for any characters that don't have
372: a registered handler. This includes both charac
373: ters that are part of markup for which no events
374: are generated (markup declarations) and characters
375: that could generate events, but for which no han
376: dler has been registered.
378: Whatever the encoding in the original document,
379: the string is returned to the handler in UTF-8.
381: Unparsed (Parser, Entity, Base, Sysid,
382: Pubid, Notation)
383: This is called for a declaration of an unparsed
384: entity. Entity is the name of the entity. Base is
385: the base to be used for resolving a relative URI.
386: Sysid is the system id. Pubid is the public id.
387: Notation is the notation name. Base and Pubid may
388: be undefined.
390: Notation (Parser, Notation, Base, Sysid,
391: Pubid)
392: This is called for a declaration of notation.
393: Notation is the notation name. Base is the base
394: to be used for resolving a relative URI. Sysid is
395: the system id. Pubid is the public id. Base,
396: Sysid, and Pubid may all be undefined.
399: ExternEnt (Parser, Base, Sysid, Pubid)
400: This is called when an external entity is refer
401: enced. Base is the base to be used for resolving a
402: relative URI. Sysid is the system id. Pubid is the
403: public id. Base, and Pubid may be undefined.
405: This handler should either return a string, which
406: represents the contents of the external entity, or
407: return an open filehandle that can be read to
408: obtain the contents of the external entity, or
409: return undef, which indicates the external entity
410: couldn't be found and will generate a parse error.
412: If an open filehandle is returned, it must be
413: returned as either a glob (*FOO) or as a reference
414: to a glob (e.g. an instance of IO::Handle).
416: ExternEntFin (Parser)
417: This is called after an external entity has been
418: parsed. It allows applications to perform cleanup
419: on actions performed in the above ExternEnt han
420: dler.
422: Entity (Parser, Name, Val, Sysid, Pubid,
423: Ndata, IsParam)
424: This is called when an entity is declared. For
425: internal entities, the Val parameter will contain
426: the value and the remaining three parameters will
427: be undefined. For external entities, the Val
428: parameter will be undefined, the Sysid parameter
429: will have the system id, the Pubid parameter will
430: have the public id if it was provided (it will be
431: undefined otherwise), the Ndata parameter will
432: contain the notation for unparsed entities. If
433: this is a parameter entity declaration, then the
434: IsParam parameter is true.
436: Note that this handler and the Unparsed handler
437: above overlap. If both are set, then this handler
438: will not be called for unparsed entities.
440: Element (Parser, Name, Model)
441: The element handler is called when an element dec
442: laration is found. Name is the element name, and
443: Model is the content model as an XML::Parser::Con
444: tentModel object. See the section on
445: "XML::Parser::ContentModel Methods" for methods
446: available for this class.
448: Attlist (Parser, Elname, Attname, Type,
449: Default, Fixed)
450: This handler is called for each attribute in an
451: ATTLIST declaration. So an ATTLIST declaration
452: that has multiple attributes will generate multi
453: ple calls to this handler. The Elname parameter is
454: the name of the element with which the attribute
455: is being associated. The Attname parameter is the
456: name of the attribute. Type is the attribute type,
457: given as a string. Default is the default value,
458: which will either be "#REQUIRED", "#IMPLIED" or a
459: quoted string (i.e. the returned string will begin
460: and end with a quote character). If Fixed is true,
461: then this is a fixed attribute.
465: Doctype (Parser, Name, Sysid, Pubid,
466: Internal)
467: This handler is called for DOCTYPE declarations.
468: Name is the document type name. Sysid is the sys
469: tem id of the document type, if it was provided,
470: otherwise it's undefined. Pubid is the public id
471: of the document type, which will be undefined if
472: no public id was given. Internal will be true or
473: false, indicating whether or not the doctype dec
474: laration contains an internal subset.
476: DoctypeFin (Parser)
477: This handler is called after parsing of the DOC
478: TYPE declaration has finished, including any
479: internal or external DTD declarations.
481: XMLDecl (Parser, Version, Encoding, Stan
482: dalone)
483: This handler is called for XML declarations. Ver
484: sion is a string containg the version. Encoding is
485: either undefined or contains an encoding string.
486: Standalone is either undefined, or true or false.
487: Undefined indicates that no standalone parameter
488: was given in the XML declaration. True or false
489: indicates "yes" or "no" respectively.