TECHNOLOGY
Bake an HTML Screen Scraper
Scraper is an updated version of Antonio Mota Rodrigues's open source Scraper class. The class is given a web address and a tag pattern (i.e., where the data you want to scrape is located in the HTML), and it returns an array holding the requested data objects from that web page.
Step 1: Create the vendor class
The original Scraper class was written by Antonio Mota Rodrigues and can be found at http://www.phpclasses.org/browse/package/1754.html. We were forced to make a few modifications to the class in order to make it work with CakePHP.
Filename: /app/vendors/Scraper/screen_scraper.php

/**
  * Project:    XHTML Screen Scraper PHP Class version 0.3.1<br />
  * File:       screen_scraper.php<br />
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.<br /><br />
  * 
  * Keywords: html scraper, html screen scraper, html to array, convert html tables into arrays
  * extract data from web pages, get html page as array, scraping, scrapers, xhtml to array, xhtml2array<br /><br />
  
  * @copyright 2004 Antonio Mota Rodrigues.  2009 Updated by Debugged Interactive Designs.
  * @author 2004 antoniorodrigues_at_omnisinal.com. 2009 debuggeddesigns.com.
  * @version 0.3.1 (February 11, 2009)
  * @package Scraper
  */

/**
  ChangeLog

  0.3.1
  - Class updated by Debugged Interactive Designs.
  - Added structured comments, and renamed class and package.
  - Added function needed to trim whitespace from tag pattern objects.
  - Removed print statements from vendor file

  0.3
  - Class exists on phpclasses.org copyright Antonio Mota Rodrigues - antoniorodrigues_at_omnisinal.com
  
*/
class scraper {

    /**
     * Trim whitespace from a single array element in place.
     *
     * The element is received by reference and overwritten with its
     * trimmed value; nothing is returned.
     *
     * @param &$value The array element to be trimmed (modified in place)
     * @return void
     */
    function trim_array(&$value) {
        $value = trim($value);
    }

    /**
     * Scrape the html source code of a given web address.
     * Return the scraped code as a single string object.
     *
     * Every byte outside printable ASCII (32-126) is removed from the
     * response, except the line feed character (10).
     *
     * @param $s_url The web address
     * @param $s_user_agent The browser id
     * @return $s_html The html of the given web address, or an empty
     *                 string when the request could not be performed
     */
    function browse($s_url, $s_user_agent) {

        //scrape the source code
        $o_ch = curl_init();
        if ($o_ch === false) {
            return '';
        }
        curl_setopt($o_ch, CURLOPT_URL, $s_url);
        curl_setopt($o_ch, CURLOPT_USERAGENT, $s_user_agent);
        curl_setopt($o_ch, CURLOPT_HEADER, 0);
        curl_setopt($o_ch, CURLOPT_RETURNTRANSFER, 1);
        $s_html = curl_exec($o_ch);
        curl_close($o_ch);

        //curl_exec() yields false on failure; report that as "no html"
        if (!is_string($s_html)) {
            return '';
        }

        //extract html from source code: collect every unwanted byte once,
        //then remove them all with a single str_replace() call
        $a_strip = array();
        for ($ascii = 0; $ascii <= 255; $ascii++) {
            if ($ascii != 10 && ($ascii < 32 || $ascii > 126)) {
                $a_strip[] = chr($ascii);
            }
        }

        //return the scraped html
        return str_replace($a_strip, "", $s_html);
    }

    /**
     * Extract data within scraped html using the given tag pattern.
     * Return an array holding each scraped data object.
     *
     * The tag pattern holds one entry per line. An entry starting with "<"
     * is searched for (case-insensitively, as a substring) in the next tags
     * of the html; any other entry is searched for in the text following a
     * tag. The entry "<field>" captures the text that follows the tag
     * matched just before it. The LAST line of the pattern only marks the
     * end of the pattern: reaching it completes one data object, and the
     * line itself is never compared against the html.
     *
     * Captured values have tabs, line breaks and surrounding whitespace
     * removed; a value that is empty afterwards is returned as "{e}".
     *
     * @param $s_html The html source code
     * @param $s_start_pattern The term found before the pattern begins
     * @param $s_end_pattern The term found after the pattern ends
     * @param $s_model The tag pattern
     * @return $a_result One array of captured field values per match; empty
     *                   when the end pattern is missing or nothing matched
     */
    function extract($s_html, $s_start_pattern, $s_end_pattern, $s_model) {

        $a_result = array();
        if (!is_string($s_html) || $s_html === '') {
            return $a_result;
        }
        $s_start_pattern = (string) $s_start_pattern;
        $s_end_pattern = (string) $s_end_pattern;

        //cut the first block (a missing start pattern keeps the whole html)
        $i_pos = ($s_start_pattern === '') ? false : strpos($s_html, $s_start_pattern);
        if ($i_pos !== false) {
            $s_html = substr($s_html, $i_pos);
        }

        //cut last block (a missing end pattern leaves nothing to scan)
        $i_pos = ($s_end_pattern === '') ? false : strpos($s_html, $s_end_pattern);
        if ($i_pos === false) {
            return $a_result;
        }
        $s_html = substr($s_html, 0, $i_pos);

        //prepare given tag pattern
        $a_model = explode("\n", strtolower($s_model));
        foreach ($a_model as $i_key => $s_line) {
            $a_model[$i_key] = trim($s_line);
        }
        $i_model = count($a_model);
        if (!$a_model[$i_model - 1]) {
            //drop one trailing blank line
            unset($a_model[$i_model - 1]);
        }
        $i_last = count($a_model) - 1;
        if ($i_last < 1) {
            //at least one entry plus the terminating line is required
            return $a_result;
        }

        //split the html into lowercased tags and the text following each tag
        $a_html = explode("<", $s_html);
        $i_cnt = count($a_html);
        $dat = array();
        for ($f = 0; $f < $i_cnt; $f++) {
            $tag = "<" . $a_html[$f];
            $closepos = strpos($tag, ">");
            if ($closepos === false) {
                //no closing bracket: everything after "<" counts as text
                $value = (string) substr($tag, 1);
                $tag = "<";
            } else {
                $value = (string) substr($tag, $closepos + 1);
                $tag = substr($tag, 0, $closepos + 1);
            }
            $a_html[$f] = strtolower($tag);
            $dat[$f] = $value;
        }

        //walk the tags and match them against the pattern
        $pat = 0;
        $a_pat = array();
        for ($f = 0; $f < $i_cnt; $f++) {
            if (strcmp($a_model[$pat], "<field>") == 0) {
                //get data: the text that followed the previously matched tag
                $value = isset($dat[$f - 1]) ? $dat[$f - 1] : "";
                $value = str_replace(array("\t", "\n", "\r"), "", $value);
                $value = trim($value);
                if ($value === "") {
                    $value = "{e}";
                }
                array_push($a_pat, $value);
                $pat++;
                //a field consumes no tag: look at the same tag again
                $f--;
            } else {
                //check pattern
                if (substr($a_model[$pat], 0, 1) == "<") {
                    $result = strpos(" " . trim($a_html[$f]), $a_model[$pat], 0);
                } else {
                    $result = strpos(" " . strtolower($dat[$f]), $a_model[$pat], 0);
                }
                if (is_integer($result)) {
                    $pat++;
                }
            }
            if ($pat == $i_last) {
                //pattern completed: store the object and start over
                $pat = 0;
                if (count($a_pat)) {
                    array_push($a_result, $a_pat);
                }
                $a_pat = array();
            }
        }

        //return an array holding each scraper object
        return $a_result;
    }
}

Step 2: Create a Test model
Filename: /app/models/test.php

/**
 * Placeholder model for the Tests controller.
 */
class Test extends AppModel {
    //false tells CakePHP that this model does not use a database table
    var $useTable = false;
}

Step 3: Use the Scraper inside a controller
Filename: /app/controllers/tests_controller.php

class TestsController extends AppController {
    var $name = 'Tests';

    /**
     * Scrape a sample page and hand the extracted rows to the view
     * as the variable 'scraped_objects'.
     */
    function scraper() {
        //vendor('Scraper'.DS.'screen_scraper'); //use this with the 1.1 core
        App::import('Vendor', 'scraper', array('file' => 'Scraper'.DS.'screen_scraper.php'));
        $scraper = new scraper();

        //dont use a layout when printing results
        $this->autoLayout = false;
        //the web address to be scraped
        $s_url = 'http://www.debuggeddesigns.com/open-source-projects/scraper/scrape-me';
        //the browser id used for scraping
        $s_user_agent = 'Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.0.1) Gecko/20020921 Netscape/7.0';
        //scrape the html
        $s_html = $scraper->browse($s_url, $s_user_agent);

        //the term found before the tag pattern begins
        $s_start_pattern = "Information";
        //the term found after the tag pattern ends
        $s_end_pattern = "First text below table";
        //the pattern structure: one entry per line, <field> marks a value
        //to capture; the last line only terminates the pattern
        $s_model = '<tr
            <td
            <a
            <field>
            </a>
            </td>
            <td
            <field>
            </td>
            <td
            <field>
            </td>
            /tr>';

        //extract the data from the scraped html
        $a_result = $scraper->extract($s_html, $s_start_pattern, $s_end_pattern, $s_model);

        //pass objects to display
        $this->set('scraped_objects', $a_result);
    }
}

Step 4: Create a test view
Filename: /app/views/tests/scraper.thtml

<h3>Scraped content:</h3><br />
<?php
//print one line per scraped object, its fields separated by " - ";
//implode() copes with any number of fields, so the view keeps working
//when the tag pattern captures more or fewer than three values
//NOTE: values are printed exactly as scraped, without further escaping
foreach ($scraped_objects as $object_info):
    echo implode(' - ', $object_info).'<br />';
endforeach;
?>

COMMENTS (displaying 0 comments)

POST (leave a comment)

Name:
Email:
Message:
Verify:
CAPTCHA Image