Monday, February 13, 2012

Simple Website Crawler with Selenium and Python

Have you ever wanted to grab all the links on your site and run some sort of test on them?


I've run into an issue on the web application that I'm testing: periodically, due to programmer error, a variable name will leak from the back-end into a URL that is presented to the user.  These URLs not only go nowhere, they can cause invalid HTML to be produced.  Already a Selenium user, I decided to see how hard it would be to use it to write a website crawler that could test all my links.
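
(As a made-up illustration, not a URL from my actual project: a templating slip might emit a link like http://yoursite.com/account/${account_id} instead of substituting the real account id.  That stray $ character is exactly what the script below tests for.)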

After a few hours of thinking -> coding -> testing, this is what I came up with:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException

# The site under test; the crawl stays within this URL prefix.
sut = "http://yoursite.com"
to_be_scraped = []
scraped = []

# URL substrings to ignore; any link containing one of these is skipped.
ignored = ["ignore_me", "not_important_url_prefix"]
log = open('crawler_log.txt', 'w')

# I require a specific Firefox profile, you may not.
firefox_profile = "/home/brma/.mozilla/firefox/bxkc1pq1.Selenium"

def uniqify(seq):
  # uniqifies a list.  Not order preserving
  keys = {}
  for e in seq:
    keys[e] = 1
  return keys.keys()

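# Load a page in the browser and return its unique absolute links.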
def getlinks(site):
  print "Testing %s\n" % (site)
  driver.get(site)
  print "  driver.get successful\n"
  elements = driver.find_elements_by_xpath("//a")
  print "  driver.find_elements_by_xpath successful\n"
  links = []
  for link in elements:
    try:
      # Fetch the href once; get_attribute can return None.
      href = link.get_attribute("href")
      if href and href.startswith("http"):
        links.append(href)
    except StaleElementReferenceException:
      log.write("Stale element reference found!\n")
      log.flush()
  links = uniqify(links)
  return links

def testlinks(links):
  badlinks = []
  for link in links:
    # I am testing for the $ character.  You can add more tests here; see the note after the script.
    if '$' in link:
      badlinks.append(link)
  return badlinks

def ignore(link):
  for pattern in ignored:
    if pattern in link:
      return True
  return False

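# Scrape one page: collect its links, log and drop any bad ones, and
# return the rest for the crawler to consider.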
def scraper(site):
  links = getlinks(site)
  scraped.append(site)
  badlinks = testlinks(links)
  if badlinks:
    for link in badlinks:
      print "* Bad link on \"%s\" detected: \"%s\"" % (site, link)
      log.write("* Bad link on \"%s\" detected: \"%s\"" % (site, link))
      links.remove(link)
  log.write('Done scraping %s\n' % (site))
  log.flush()
  return links

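# Breadth-first crawl: pop a page off the front of the queue, scrape it,
# and queue any same-site links that haven't been seen yet.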
def crawler():
  while to_be_scraped:
    site = to_be_scraped.pop(0)
    links = scraper(site)
    for link in links:
      if not link.startswith(sut):
        print "%s is not under sut" % (link)
        continue
      elif link in scraped:
        print "%s has already been scraped" % (link)
        continue
      elif link in to_be_scraped:
        print "%s is already on the queue" % (link)
        continue
      elif ignore(link):
        print "%s is being ignored" % (link)
        continue
      else:
        print "adding %s to the queue" % (link)
        to_be_scraped.append(link)

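# Seed the queue with the site root, start the browser, and crawl.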
to_be_scraped.append(sut)
driver = webdriver.Firefox(webdriver.FirefoxProfile(firefox_profile))
crawler()
driver.quit()
log.close()


* Thanks to peterbe for the uniqify function.
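
Since testlinks() just returns a list of offending links, it's easy to bolt on more checks.  As a sketch (the regex and the extra whitespace check here are my own additions for illustration, not part of the script above), a version that also flags unresolved {{...}} template placeholders and embedded spaces might look like:

import re

# Matches {{name}}-style placeholders left behind by a template engine.
placeholder = re.compile(r'\{\{[^}]*\}\}')

def testlinks(links):
  badlinks = []
  for link in links:
    # Original test: a bare $ almost always means a leaked variable.
    # The placeholder and whitespace tests are hypothetical extras.
    if '$' in link or placeholder.search(link) or ' ' in link:
      badlinks.append(link)
  return badlinks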

This is the slimmed-down version.  I have some extra, project-specific constraints on which URLs I add to the queue in getlinks().
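
As a rough illustration of what such a constraint can look like (the suffixes and the helper below are invented for this post, not my real rules), you could guard the append in getlinks() with a filter like this:

# Hypothetical project-specific filter: skip file downloads and
# in-page anchors before a link ever makes it into the results.
SKIP_SUFFIXES = ('.pdf', '.zip', '.png')

def wanted(href):
  if not href.startswith("http"):
    return False
  if href.lower().endswith(SKIP_SUFFIXES):
    return False
  if '#' in href:
    return False
  return True

and then the append in getlinks() becomes: if wanted(href): links.append(href).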

This script sort of evolved one afternoon when I was just experimenting.  Please feel free to take this code and modify it for your own use.  If you have any comments or suggestions, I'd be interested to hear them!