TECHNOLOGY
Bake an HTML Screen Scraper
Scraper is an updated version of Antonio Mota Rodrigues's open source Scraper class. The class is given a web address and a tag pattern (i.e., a description of where the data you want to scrape is located in the HTML), and it returns an array holding the requested data objects from that web page.
Step 1: Create the vendor class
Filename: /app/vendors/Scraper/screen_scraper.php
/**
* Project: XHTML Screen Scraper PHP Class version 0.3.1<br />
* File: screen_scraper.php<br />
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.<br /><br />
*
* Keywords: html scraper, html screen scraper, html to array, convert html tables into arrays
* extract data from web pages, get html page as array, scraping, scrapers, xhtml to array, xhtml2array<br /><br />
* @copyright 2004 Antonio Mota Rodrigues. 2009 Updated by Debugged Interactive Designs.
* @author 2004 antoniorodrigues_at_omnisinal.com. 2009 debuggeddesigns.com.
* @version 0.3.1 (February 11, 2009)
* @package Scraper
*/
/**
ChangeLog
0.3.1
- Class updated by Debugged Interactive Designs.
- Added structured comments, and renamed class and package.
- Added function needed to trim whitespace from tag pattern objects.
- Removed print statements from vendor file
0.3
- Class exists on phpclasses.org copyright Antonio Mota Rodrigues - antoniorodrigues_at_omnisinal.com
*/
class scraper {
    /**
     * Remove leading and trailing whitespace from a value, in place.
     *
     * Kept for callers that use it directly; extract() no longer depends on it.
     *
     * @param &$value The value to be trimmed (modified by reference)
     * @return void Nothing is returned; the argument itself is changed
     */
    function trim_array(&$value){ $value = trim($value); }

    /**
     * Download the source code of a given web address.
     * Return the downloaded code as a single string, reduced to line feeds
     * and printable ASCII (bytes 32-126).
     *
     * NOTE: every byte >= 127 is removed, so all non-ASCII text (including
     * multi-byte UTF-8 characters) is dropped from the result.
     *
     * @param $s_url The web address
     * @param $s_user_agent The browser id
     * @return $s_html The html of the given web address, or an empty string
     *                 when the transfer fails
     */
    function browse($s_url, $s_user_agent) {
        //scrape the source code
        $o_ch = curl_init();
        if ($o_ch === false) {
            return '';
        }
        curl_setopt($o_ch, CURLOPT_URL, $s_url);
        curl_setopt($o_ch, CURLOPT_USERAGENT, $s_user_agent);
        curl_setopt($o_ch, CURLOPT_HEADER, 0);
        curl_setopt($o_ch, CURLOPT_RETURNTRANSFER, 1);
        $s_html = curl_exec($o_ch);
        curl_close($o_ch);

        //curl_exec() returns false when the transfer fails
        if (!is_string($s_html)) {
            return '';
        }

        //keep only line feeds and printable ascii; drop every other byte
        $s_clean = preg_replace('/[^\n\x20-\x7E]/', '', $s_html);

        //return the scraped html
        return is_string($s_clean) ? $s_clean : '';
    }

    /**
     * Extract data within scraped html using the given tag pattern.
     * Return an array holding each scraped data object.
     *
     * The tag pattern holds one entry per line. An entry starting with "<"
     * is searched for inside the next tags, any other entry is searched for
     * inside the text following the tags, and "<field>" captures the text
     * that follows the most recently inspected tag. The last line of the
     * pattern only marks the end of the pattern and is never matched itself.
     *
     * @param $s_html The html source code
     * @param $s_start_pattern The term found before the pattern begins
     *                         (when it is not found, scanning starts at the top)
     * @param $s_end_pattern The term found after the pattern ends
     *                       (when it is not found, nothing is extracted)
     * @param $s_model The tag pattern
     * @param $b_clean When true, tabs and line breaks are removed from each
     *                 captured value, the value is trimmed, and an empty
     *                 value is replaced by "{e}". Defaults to false, which
     *                 returns the captured text untouched.
     * @return $a_result One array of captured values per completed pattern
     */
    function extract ($s_html, $s_start_pattern, $s_end_pattern, $s_model, $b_clean = false) {
        $a_result = array();
        $s_html = (string) $s_html;
        $s_start_pattern = (string) $s_start_pattern;
        $s_end_pattern = (string) $s_end_pattern;

        //cut the first block (a missing start term leaves the html untouched)
        if ($s_start_pattern !== '') {
            $i_pos = strpos($s_html, $s_start_pattern);
            if ($i_pos !== false) {
                $s_html = substr($s_html, $i_pos);
            }
        }

        //cut last block (a missing end term leaves nothing to scan)
        if ($s_end_pattern === '') {
            return $a_result;
        }
        $i_pos = strpos($s_html, $s_end_pattern);
        if ($i_pos === false) {
            return $a_result;
        }
        $s_html = (string) substr($s_html, 0, $i_pos);

        //prepare given tag pattern: lower case, one trimmed entry per line
        $a_model = array_map('trim', explode("\n", strtolower((string) $s_model)));
        if ($a_model[count($a_model) - 1] === '') {
            //drop the blank entry left behind by a trailing line break
            array_pop($a_model);
        }
        $i_model = count($a_model);

        //the last entry is only a terminator, so at least two entries are needed
        if ($i_model < 2) {
            return $a_result;
        }

        //a pattern made of nothing but <field> entries would never advance
        //through the html, so refuse it instead of looping forever
        $b_has_matcher = false;
        for ($i = 0; $i < $i_model - 1; $i++) {
            if ($a_model[$i] !== '<field>') {
                $b_has_matcher = true;
                break;
            }
        }
        if (!$b_has_matcher) {
            return $a_result;
        }

        //split the html into lower-cased tags and the text following each tag
        $a_tags = array();
        $a_data = array();
        foreach (explode('<', $s_html) as $s_chunk) {
            $i_close = strpos($s_chunk, '>');
            if ($i_close === false) {
                //no closing bracket: the whole chunk is text
                $a_tags[] = '<';
                $a_data[] = $s_chunk;
            } else {
                $a_tags[] = strtolower('<' . substr($s_chunk, 0, $i_close + 1));
                $a_data[] = (string) substr($s_chunk, $i_close + 1);
            }
        }
        $i_cnt = count($a_tags);

        //walk through the tags, matching one pattern entry at a time
        $pat = 0;
        $a_pat = array();
        $f = 0;
        while ($f < $i_cnt) {
            if ($a_model[$pat] === '<field>') {
                //get data: the text after the previously inspected tag
                $value = ($f > 0) ? $a_data[$f - 1] : '';
                if ($b_clean) {
                    $value = trim(str_replace(array("\t", "\n", "\r"), '', $value));
                    if ($value === '') {
                        $value = '{e}';
                    }
                }
                $a_pat[] = $value;
                $pat++;
                //a field consumes no tag, so $f is left unchanged
            } else {
                //check pattern against the tag or against the text after it
                if (substr($a_model[$pat], 0, 1) == '<') {
                    $s_haystack = trim($a_tags[$f]);
                } else {
                    $s_haystack = strtolower($a_data[$f]);
                }
                if (strpos($s_haystack, $a_model[$pat]) !== false) {
                    $pat++;
                }
                $f++;
            }

            //every entry before the terminator matched: store the object
            if ($pat == $i_model - 1) {
                $pat = 0;
                if (count($a_pat)) {
                    $a_result[] = $a_pat;
                }
                $a_pat = array();
            }
        }

        //return an array holding each scraper object
        return $a_result;
    }
}
Step 2: Create a Test model
Filename: /app/models/test.php
/**
 * Model used by the scraper demo; it only exists so the Tests controller has a model to load.
 */
class Test extends AppModel {
//table use is switched off for this model (presumably so CakePHP does not look for a "tests" table - confirm against the core in use)
var $useTable = false;
}
Step 3: Use the Scraper inside a controller
Filename: /app/controllers/tests_controller.php
/**
 * Demo controller showing how to drive the Scraper vendor class.
 */
class TestsController extends AppController {
    var $name = 'Tests';

    /**
     * Scrape the demo page, pull the table rows out of it and hand them to the view.
     */
    function scraper() {
        //results are printed without a layout
        $this->autoLayout = false;

        //load the vendor class
        //(with the 1.1 core use: vendor('Scraper'.DS.'screen_scraper');)
        App::import('Vendor', 'scraper', array('file' => 'Scraper' . DS . 'screen_scraper.php'));
        $screen_scraper = new scraper();

        //download the page, identifying ourselves with a browser id
        $page_html = $screen_scraper->browse(
            'http://www.debuggeddesigns.com/open-source-projects/scraper/scrape-me',
            'Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.0.1) Gecko/20020921 Netscape/7.0'
        );

        //tag pattern describing one table row; every <field> marks a value to capture
        $row_pattern = '<tr
<td
<a
<field>
</a>
</td>
<td
<field>
</td>
<td
<field>
</td>
/tr>';

        //only the html between these two terms is searched for the pattern
        $begin_marker = "Information";
        $finish_marker = "First text below table";

        //extract the rows and pass them to the view
        $this->set(
            'scraped_objects',
            $screen_scraper->extract($page_html, $begin_marker, $finish_marker, $row_pattern)
        );
    }
}
Step 4: Create a test view
Filename: /app/views/tests/scraper.thtml
<h3>Scraped content:</h3><br />
<?php
//print the first three values of every scraped object on a line of its own
foreach ($scraped_objects as $scraped_row) {
    $columns = array($scraped_row[0], $scraped_row[1], $scraped_row[2]);
    echo implode(' - ', $columns) . '<br />';
}
?>