Html.pm
NAME
Communiware::Datatype::Html - implements HTML datatype.
SYNOPSIS
Implements datatype which is stored as html, but edited as text, rtf or html.
DESCRIPTION
HTML datatype is most common datatype in Communiware. It can be uploaded as plain text with special emphasis chars, rtf files (only via upload method) and as is. After that it is passed through syntax checker during upload.
img src and link href are preprocessed to find references to other communiware filters. USES argument is added to attrs.
FUNCIONS
save
Distinguishes between HTML and Communiware rich text input.
Calls procedures from Communiware::Format::<etc>
Uses options FORMAT for formatting text (see Communiware::Text docs) NO_LINT to normalize HTML instead of manual correction.
Format defaults to strict paragrapgs.
In case of HTML, calls the do_html manpage routine described below.
upload
Distinguishes between text,html and rtf.
If rtf, then converts it into html and calls the do_html manpage routine Otherwise, reads text in the scalar variable and calls save.
get
Adds correct HTML head. We must process representation validation outside of this procedure
methods
No additional methods defined
options
return list of options allowed for this content type - FORMAT, NO_LINT Note, it returns names of options as passed in cgi param names.
INTERNAL PROCEDURES
do_html
Recieves item_id, text, list of attributes and flag, indicating that html validation step should be fixed.
Validates (if not forbidden) html, rewrites links to pictures and cmw: scheme urls and returns content of <BODY> tag.
set_errorInfo
syntax
set_errorInfo(filename,linenumber);
Should be called before get or upload methods. Sets file name which should be reported in HTML validator messages and line number which should be added to line number reported by validator.
This information would be cleared out upon completing validation
preprocess_rtf
parameters item_id and filehandle reference Return text in html format. Tries to extract pictures.
normalize_html This sub is an attempt to normalize bad html It takes html text and returns corrected (possibly the same author wanted) version of it. =cut
{ package MyParser; require HTML::Element; use base 'HTML::TreeBuilder';
use constant DEBUG => 0;
# Следующий фрагмент (функция text) взят "как есть" из текста модуля # HTML::TreeBuilder, за исключением того, что в нем закомментарен фрагмент, # помеченный, как *** for Communiware ***, который выполняет "лишнюю" # для наших целей раскодировку entities.
my($indent, $nugget);
sub text { return if $_[0]{'_stunted'}; # Accept a "here's a text token" signal from HTML::Parser. my($self, $text, $is_cdata) = @_; # the >3.0 versions of Parser may pass a cdata node. # Thanks to Gisle Aas for pointing this out. return unless length $text; # I guess that's always right my $ignore_text = $self->{'_ignore_text'}; my $no_space_compacting = $self->{'_no_space_compacting'}; my $pos = $self->{'_pos'} || $self; # *** for Communiware *** # HTML::Entities::decode($text) # unless $ignore_text || $is_cdata # || $HTML::Tagset::isCDATA_Parent{$pos->{'_tag'}}; # *** end for Communiware *** #my($indent, $nugget); if(DEBUG) { # optimization -- don't figure out depth unless we're in debug mode my @lineage_tags = $pos->lineage_tag_names; $indent = ' ' x (1 + @lineage_tags); $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...'); $nugget =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $indent, "Proposing a new text node ($nugget) under ", join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root', ".\n"; #} else { # $indent = ' '; } my $ptag; if ($HTML::Tagset::isCDATA_Parent{$ptag = $pos->{'_tag'}} #or $pos->is_inside('pre') or $pos->is_inside('pre', 'textarea') ) { return if $ignore_text; $pos->push_content($text); } else { # return unless $text =~ /\S/; # This is sometimes wrong if (!$self->{'_implicit_tags'} || $text !~ /\S/) { # don't change anything } elsif ($ptag eq 'head' or $ptag eq 'noframes') { if($self->{'_implicit_body_p_tag'}) { print $indent, " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" if DEBUG > 1; $self->end(\$ptag); $pos = $self->{'_body'} ? ($self->{'_pos'} = $self->{'_body'}) # expected case : $self->insert_element('body', 1); $pos = $self->insert_element('p', 1); } else { print $indent, " * Text node under \U$ptag\E closes, implicates BODY.\n" if DEBUG > 1; $self->end(\$ptag); $pos = $self->{'_body'} ? ($self->{'_pos'} = $self->{'_body'}) # expected case : $self->insert_element('body', 1); } } elsif ($ptag eq 'html') { if($self->{'_implicit_body_p_tag'}) { print $indent, " * Text node under HTML implicates BODY and P.\n" if DEBUG > 1; $pos = $self->{'_body'} ? ($self->{'_pos'} = $self->{'_body'}) # expected case : $self->insert_element('body', 1); $pos = $self->insert_element('p', 1); } else { print $indent, " * Text node under HTML implicates BODY.\n" if DEBUG > 1; $pos = $self->{'_body'} ? ($self->{'_pos'} = $self->{'_body'}) # expected case : $self->insert_element('body', 1); #print "POS is $pos, ", $pos->{'_tag'}, "\n"; } } elsif ($ptag eq 'body') { if($self->{'_implicit_body_p_tag'}) { print $indent, " * Text node under BODY implicates P.\n" if DEBUG > 1; $pos = $self->insert_element('p', 1); } } elsif ($ptag eq 'table') { print $indent, " * Text node under TABLE implicates TR and TD.\n" if DEBUG > 1; $self->insert_element('tr', 1); $pos = $self->insert_element('td', 1); # double whammy! } elsif ($ptag eq 'tr') { print $indent, " * Text node under TR implicates TD.\n" if DEBUG > 1; $pos = $self->insert_element('td', 1); } # elsif ( # # $ptag eq 'li' || # # $ptag eq 'dd' || # $ptag eq 'form') { # $pos = $self->insert_element('p', 1); #} # Whatever we've done above should have had the side # effect of updating $self->{'_pos'} #print "POS is now $pos, ", $pos->{'_tag'}, "\n"; return if $ignore_text; $text =~ s/\s+/ /g unless $no_space_compacting ; # canonical space print $indent, " (Attaching text node ($nugget) under ", # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, $pos->{'_tag'}, ").\n" if DEBUG > 1; $pos->push_content($text); } &{ $self->{'_tweak_~text'} || return }($text, $pos, $pos->{'_tag'} . ''); # Note that this is very exceptional -- it doesn't fall back to # _tweak_*, and it gives its tweak different arguments. return; } }
sub normalize_html { | |
local $/; | |
undef $/; | |
my $html = shift; |
my $tree = MyParser->new; $tree->p_strict(1); $tree->store_comments(1); $tree->no_space_compacting(1); $tree->ignore_ignorable_whitespace(0);
# $tree->implicit_body_p_tag(1); $tree->parse($html); $tree->eof(); #Признак прекращения потокового ввода
my @tag_list;
foreach my $forb (@Communiware::Datatype::Html::forbidden_tag) { @tag_list = $tree->find_by_tag_name($forb);
foreach my $cur_tag (@tag_list) { $cur_tag->delete; } }
$html = $tree->as_HTML("<>\240", undef, {}); $tree->delete();
my $replacement = $Communiware::Charset::replace_string; $replacement =~ s/[Ёё]//g;
$html =~ s/([$replacement])/$Communiware::Charset::escaping{$1}/g; return $html; }
1;