Tuesday, March 29, 2011

Scrape HTML

Note: This scrape no longer works correctly -- such are the dangers of screen scraping! ;)


@Grapes([
    @Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='1.2')
    ,@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.14')
    //,@Grab(group='org.htmlcleaner', module='htmlcleaner', version='2.2')
    ]
)

import org.ccil.cowan.tagsoup.* //TagSoup
import org.cyberneko.html.parsers.SAXParser //NekoHTML
import org.htmlcleaner.* //not in maven -- http://htmlcleaner.sourceforge.net/, http://dist.wso2.org/maven2/org/htmlcleaner/htmlcleaner/

// Target page for both scraping demos below.
def url = new URL("http://lifehacker.com")


// Demo 1: TagSoup — a forgiving SAX parser plugged into Groovy's XmlSlurper.
println "*" * 15 + 'TAG SOUP' + "*" * 15
slurper = new XmlSlurper(new Parser())   // binding variable (no 'def'), as in the original script
url.withReader { r ->
    html = slurper.parse(r)              // parse the live page via the lenient parser
    println findLinks(html).join('\n')
}


// Demo 2: NekoHTML — another HTML-tolerant SAX parser; the balance-tags
// feature must be relaxed so document fragments parse cleanly.
println "*" * 15 + 'NekoHTML' + "*" * 15
def nekoParser = new SAXParser()
nekoParser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true)
html = new XmlSlurper(nekoParser).parseText(url.text)
println findLinks(html).join('\n')

/*
println "*" * 15 + 'HTMLCleaner' + "*" * 15
def cleaner = new HtmlCleaner()
def node = cleaner.clean(url)
def props = cleaner.getProperties()
def serializer = new SimpleXmlSerializer(props)
def xml = serializer.getXmlAsString(node)
def page = new XmlSlurper(false,false).parseText(xml)
println findLinks(page).join('\n')
*/


///  UTILS
/**
 * Collects post links from a parsed page (GPathResult tree).
 *
 * Walks the tree for the element with id "splashPosts", then gathers the
 * href of every anchor beneath it that is not a /tag/ link, stripping the
 * tracking query string. Prints how many anchors matched.
 *
 * @param root parsed document (XmlSlurper GPathResult); may be null
 * @return sorted Set of hrefs; empty when the container is missing
 */
Set findLinks(def root) {
    // Locate the posts container; depthFirst() visits every node in the tree.
    def posts = root?.depthFirst()?.find { it.@id.text() == "splashPosts" }
    if (posts == null) {
        // FIX: the original NPE'd here (.collect on null, then null.sort())
        // whenever the page layout changed — which the post says it did.
        println "found 0"
        return [] as Set
    }
    int cnt = 0
    Set links = posts.depthFirst()
        .findAll { it.name().toLowerCase() == "a" && !it.@href.text().contains('/tag/') }
        .collect { cnt++; it.@href.text() - '?skyline=true&s=i' } as Set
    println "found $cnt"
    // FIX: return an explicitly sorted Set; the original returned
    // links.sort() (a List) from a Set-typed method, relying on an
    // implicit coercion that does not promise to keep the sort order.
    return new TreeSet(links)
}

http://groovyconsole.appspot.com/script/448003

BONUS: CSS Selectors with JSoup

// Demo: JSoup's CSS-selector API — fetch Bing results and print each hit.
// @Grapes stays attached to the declaration so Grab resolves before use.
@Grapes( @Grab(group='org.jsoup', module='jsoup', version='1.6.1' ))
def doc = org.jsoup.Jsoup.connect("http://www.bing.com/search?q=web+scraping").get()
println 'start'
// "#results h3 a" selects every result-title anchor, jQuery style.
for (node in doc.select("#results h3 a")) {
    println '-->' + node.text() + ' == ' + node.attr('href')
}
println 'done'

BONUS: XPath Selectors with TagSoup

@Grapes( @Grab('org.ccil.cowan.tagsoup:tagsoup:1.2') ) 
import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.sax.*;
import javax.xml.xpath.*

// Demo: XPath over messy HTML — TagSoup implements XMLReader, so the
// standard JAXP identity transform can turn malformed HTML into a W3C DOM.
def urlString = "http://www.bing.com/search?q=web+scraping"
URL url = new URL(urlString);

XMLReader reader = new Parser();
//Transform SAX to DOM
// Disable namespace reporting so the XPath below needs no prefixes.
reader.setFeature(Parser.namespacesFeature, false);
reader.setFeature(Parser.namespacePrefixesFeature, false);
Transformer transformer = TransformerFactory.newInstance().newTransformer();
DOMResult result = new DOMResult();
// FIX: the original leaked the HTTP stream — withStream closes it even on error.
url.openStream().withStream { ins ->
    transformer.transform(new SAXSource(reader, new InputSource(ins)), result);
}

def xpath = XPathFactory.newInstance().newXPath()

//CSS selector: $('#results h3 a')
def results = xpath.evaluate( '//*[@id=\'results\']//h3/a', result.getNode(), XPathConstants.NODESET )

results.each { println it }
println 'done'

No comments:

Post a Comment