Simple screen scraping example with PhantomJS

Let's say you wanted to get all the links from an unordered list on a webpage.

There are several ways to do it but let's use PhantomJS.

PhantomJS is a headless browser; that is, it runs without a visible window. It loads a page and executes the page's JavaScript, and you can inject libraries like jQuery to query the DOM and read attributes of the selected elements.

It is mostly used in testing but works well as a screen scraper too.

The page I am interested in has an element with the id nav-product that contains an unordered list. For each list item I want the link text and the link's href.

<div id="nav-product">
  <ul>
    <li>
      <a href="blahblah.html">
        category blah
      </a>
    </li>
    ...
  </ul>
</div>

Let's walk through the code:
We use a JavaScript object with a function called execute. Inside it, we create the webpage object from PhantomJS and open our URL of interest.

var category = {
  execute : function(){
  var page = require('webpage').create();
  page.open( 'http://example.com' , function(){

Next we include the jQuery library.

page.includeJs("http://ajax.googleapis.com/
ajax/libs/jquery/3.2.1/jquery.min.js",

Evaluate the page and use jQuery to iterate through the list and return what we need.

function() {
  var results =   page.evaluate(function() {
  var cats = []; //temporary array
   $( "#nav-product > ul:first li").each( 
  function(){

Push the results as a hash on to the temporary array.

cats.push( { cat_url: 
$(this).find('a').attr('href'), cat: $(this 
  ).text() });
  });
  return cats;
});

Then print each result to standard output so it can be redirected to a file.

for ( var x = 0; x < results.length; x++ ){
  console.log( results[x].cat_url +","+ 
  results[x].cat );
}
    phantom.exit();
    });
   });
  }
}

Call the execute function.

category.execute();

Here is the script in its entirety:

var category = {
  execute : function(){
  var page = require('webpage').create();
  page.open( 'http://www.example.com' , 
    function() {
    page.includeJs("http://ajax.googleapis.com/
    ajax/libs/jquery/3.2.1/jquery.min.js",
   function() {
     var results =   page.evaluate(function() {
     var cats = [];
   $( "#nav-product > ul:first li").each( 
     function(){
     cats.push( { cat_url: 
    $(this).find('a').attr('href'), cat: $(this 
    ).text() });
    });
   return cats;
  });
 for ( var x = 0; x < results.length; x++ ){
   console.log( results[x].cat_url +","+ 
   results[x].cat );
   }
    phantom.exit();
   });
  });
 }
}
// Kick off the scrape; defining the category object alone does not run anything.
category.execute();

Run the script from the command line like so:

phantomjs --debug=false --cookies-file=cookie.txt --ignore-ssl-errors=true example.js > out.txt

Set --debug to true to get a verbose explanation of what is happening.

This is a very simple script that works, but if you want to make it more complicated, you should add some error handling. PhantomJS provides an onError callback that you should use for anything more complicated.
Without error handling the script is hard to debug: if there is a JavaScript error, the script just hangs.

I will show how to use error handling in future tutorials.