Thursday, June 6, 2013

Scrape image src attributes from a web page

import java.net.URLEncoder

// Page whose <img> elements we want to list.
String url = "http://msnbc.com"

// Hand-encoded YQL query: select * from html where url="<url>" and xpath="//img"
// The target URL is encoded with an explicit charset; the one-argument
// URLEncoder.encode(String) is deprecated because it falls back to the
// platform default encoding, which can differ from machine to machine.
String serviceUrl = "http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22${URLEncoder.encode(url, 'UTF-8')}%22%20and%20xpath%3D%22%2F%2Fimg%22"
//println serviceUrl

String resultXML = serviceUrl.toURL().text // YQL will return the HTML page as XML!
//println resultXML

def root = new XmlSlurper().parseText(resultXML)

// Gather the src attribute of every <img> found under <results>, convert each
// to a plain String, and drop duplicates.
List imgSrcs = root.results.img.@src as List
imgSrcs = imgSrcs*.toString().unique()
//println imgSrcs.join('\n')

No comments:

Post a Comment