2007-10-08 13:48:33大D QQ

string parse

perl sample

#!/usr/bin/perl -w

# Extract all plain text from an HTML file

use strict;
use HTML::Parser 3.00 ();

# Per-tag counter keyed by tag name; tag() adds its numeric argument to the
# entry for the tag it was called with, and text() consults the "script" and
# "style" entries to decide whether to print.
my %inside;

# Start/end tag handler: adjusts the per-tag counter in %inside and emits a
# single space so that text on either side of a tag does not run together.
#   $tag - name of the tag being opened or closed
#   $num - amount to add to the counter for $tag (positive on open,
#          negative on close)
sub tag
{
    my ($tag, $num) = @_;
    $inside{$tag} += $num;
    print " ";    # not for all tags
}

# Text handler: prints the text chunk it is given, except while the counters
# for "script" or "style" in %inside are non-zero (their content is not
# human-readable text).
#   $_[0] - the text chunk to print
sub text
{
    my ($chunk) = @_;

    if ($inside{script} || $inside{style}) {
        return;
    }

    print $chunk;
}

# Build an event-driven (API version 3) parser and run it over the file named
# by the first command-line argument.
#   start tags -> tag(tagname, +1)
#   end tags   -> tag(tagname, -1)
#   text       -> text(dtext)
# Handlers are passed as code references (\&name); a bare &name would call the
# sub on the spot instead of registering it.
# parse_file() returns a false value when the file cannot be opened, hence the
# "or die".
HTML::Parser->new(api_version     => 3,
                  handlers        => [start => [\&tag,  "tagname, '+1'"],
                                      end   => [\&tag,  "tagname, '-1'"],
                                      text  => [\&text, "dtext"],
                                     ],
                  marked_sections => 1,
                 )->parse_file(shift) or die "Can't open file: $!\n";

PHP sample

Html Parser Class

This is an HTML parser class, used to parse HTML and XML. One of the unique features of this class is that it supports the innerHTML property.


<?php
/**
* HTML/XML Parser Class
*
* This is a helper class that is used to parse HTML and XML. A unique feature of this parsing class
* is the fact that it includes support for innerHTML (which isn't easy to do).
*
* The input must be well-formed XML (e.g. XHTML); it is fed to PHP's XML parser
* extension, which rejects malformed markup.
*
* @author Dennis Pallett
* @copyright Dennis Pallett 2006
* @package HTML_Parser
* @version 1.0
*/

// Helper Class
// To parse HTML/XML
class HTML_Parser {
    // Private properties
    var $_parser;                   // XML parser handle created by parse()
    var $_tags = array();           // currently open tag for each nesting level
    var $_html;                     // re-serialised markup built up while parsing
    var $output = array();          // stack of tag arrays; collapses into the tree
    var $strXmlData;                // return value of xml_parse() (false-ish on error)
    var $_level = 0;                // current nesting depth
    var $_outline;                  // like $_html, but open tags carry a level prefix
    var $_tagcount = array();
    var $xml_error = false;         // true when the last parse() failed
    var $xml_error_code;
    var $xml_error_string;
    var $xml_error_line_number;

    /**
     * Returns the markup re-serialised during the last parse() call.
     *
     * @return string|null null if parse() has never been called
     */
    function get_html () {
        return $this->_html;
    }

    /**
     * Parses a well-formed HTML/XML string into a tree of tag arrays.
     *
     * Each tag array has the keys "name" and "attr" (both upper-cased by the
     * parser's case folding) and "level", plus, where applicable, "tagData"
     * (text content), "innerhtml" and "children" (list of nested tag arrays).
     *
     * @param string $strInputXML markup to parse
     * @return array|false list holding the root tag array, or false on a parse
     *                     error (details are stored in the xml_error_* properties)
     */
    function parse($strInputXML) {
        // Reset every piece of per-parse state so the instance can be reused;
        // otherwise _html/_outline keep growing and a stale error flag survives.
        $this->output = array();
        $this->_tags = array();
        $this->_html = '';
        $this->_outline = '';
        $this->_level = 0;
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Translate entities
        $strInputXML = $this->translate_entities($strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);

        // Register the handlers as object/method callables
        xml_set_element_handler(
            $this->_parser,
            array($this, 'tagOpen'),
            array($this, 'tagClosed')
        );
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        $this->strXmlData = xml_parse($this->_parser, $strInputXML);

        if (!$this->strXmlData) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }

        return $this->output;
    }

    /**
     * Start-element handler: pushes a new tag array onto the output stack and
     * appends the opening tag to the HTML and the outline.
     *
     * @param mixed  $parser XML parser handle (unused)
     * @param string $name   element name
     * @param array  $attr   attribute name => value
     */
    function tagOpen($parser, $name, $attr) {
        // Increase level
        $this->_level++;

        // Create tag:
        $newtag = $this->create_tag($name, $attr);

        // Build tag
        $tag = array("name" => $name, "attr" => $attr, "level" => $this->_level);

        // Add tag
        array_push($this->output, $tag);

        // Add tag to this level
        $this->_tags[$this->_level] = $tag;

        // Add to HTML
        $this->_html .= $newtag;

        // Add to outline; the level prefix lets tagClosed() find the matching
        // opening tag even when the same element is nested inside itself
        $this->_outline .= $this->_level . $newtag;
    }

    /**
     * Serialises an opening tag in lower case, e.g. <a href="x">.
     * "br" and "input" are written self-closed (<br />).
     *
     * @param string $name element name
     * @param array  $attr attribute name => value
     * @return string
     */
    function create_tag ($name, $attr) {
        // Create tag:
        # Begin with name
        $tag = '<' . strtolower($name) . ' ';

        # Create attribute list
        foreach ($attr as $key => $val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }

        # Finish tag (drops the trailing space)
        $tag = trim($tag);

        switch (strtolower($name)) {
            case 'br':
            case 'input':
                $tag .= ' /';
                break;
        }

        $tag .= '>';

        return $tag;
    }

    /**
     * Serialises a closing tag in lower case, e.g. </a>.
     *
     * @param string $name element name
     * @return string
     */
    function _closing_tag ($name) {
        return '</' . strtolower($name) . '>';
    }

    /**
     * Character-data handler: appends non-blank text to the "tagData" of the
     * tag on top of the output stack and to the HTML and the outline.
     *
     * @param mixed  $parser  XML parser handle (unused)
     * @param string $tagData text chunk
     */
    function tagData($parser, $tagData) {
        if (trim($tagData)) {
            $last = count($this->output) - 1;

            if (isset($this->output[$last]['tagData'])) {
                $this->output[$last]['tagData'] .= $tagData;
            } else {
                $this->output[$last]['tagData'] = $tagData;
            }
        }

        $this->_html .= htmlentities($tagData);
        $this->_outline .= htmlentities($tagData);
    }

    /**
     * End-element handler: emits the closing tag, extracts the element's
     * innerHTML from the outline and folds the finished tag into its parent.
     *
     * @param mixed  $parser XML parser handle (unused)
     * @param string $name   element name
     */
    function tagClosed($parser, $name) {
        $closing = $this->_closing_tag($name);

        // Add to HTML and outline
        switch (strtolower($name)) {
            case 'br':
            case 'input':
                // Already written self-closed by create_tag()
                break;
            default:
                $this->_outline .= $this->_level . $closing;
                $this->_html .= $closing;
        }

        // Get tag that belongs to this end
        $tag = $this->_tags[$this->_level];
        $tag = $this->create_tag($tag['name'], $tag['attr']);

        // Try to get innerHTML: everything between the level-prefixed opening
        // tag and the level-prefixed closing tag
        $regex = '%' . preg_quote($this->_level . $tag, '%') . '(.*?)' . preg_quote($this->_level . $closing, '%') . '%is';
        preg_match($regex, $this->_outline, $matches);

        // Remove level identifiers
        $this->_outline = str_replace($this->_level . $tag, $tag, $this->_outline);
        $this->_outline = str_replace($this->_level . $closing, $closing, $this->_outline);

        $last = count($this->output) - 1;

        // Add innerHTML
        if (isset($matches[1])) {
            $this->output[$last]['innerhtml'] = $matches[1];
        }

        // Fix tree: move the finished tag under its parent. The root element
        // has no parent and simply stays in the output as its only entry.
        if ($last >= 1) {
            $this->output[$last - 1]['children'][] = $this->output[$last];
            array_pop($this->output);
        }

        // Decrease level
        $this->_level--;
    }

    /**
     * Converts named HTML entities (&nbsp;, &eacute;, ...) into numeric ones
     * (&#160;, &#233;, ...), which an XML parser understands without a DTD.
     * The XML built-ins &amp; &quot; &lt; &gt; are left untouched.
     *
     * @param string $xmlSource text to convert
     * @param bool   $reverse   when true, convert numeric entities back to named ones
     * @return string
     */
    function translate_entities($xmlSource, $reverse = FALSE) {
        static $literal2NumericEntity;

        if (empty($literal2NumericEntity)) {
            // Ask for the ISO-8859-1 table explicitly: its keys are single
            // bytes whose ord() equals the Unicode code point. With a UTF-8
            // table ord() would only see the first byte of a multi-byte key.
            $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');

            foreach ($transTbl as $char => $entity) {
                if (strpos('&"<>', $char) !== FALSE) continue;
                $literal2NumericEntity[$entity] = '&#' . ord($char) . ';';
            }
        }

        if ($reverse) {
            return strtr($xmlSource, array_flip($literal2NumericEntity));
        } else {
            return strtr($xmlSource, $literal2NumericEntity);
        }
    }
}

// To be used like this ($html must already hold the markup to parse)
$parser = new HTML_Parser;
$output = $parser->parse($html);

if ($output === false) {
    // parse() returns false on malformed input; the xml_error_* properties say why
    echo 'XML error ' . $parser->xml_error_code
        . ' on line ' . $parser->xml_error_line_number
        . ': ' . $parser->xml_error_string . "\n";
} else {
    print_r ($output);
}

?>