# --------------------------------------------------
# Web-scraping walkthrough: read an HTML table straight from a URL, then
# download a page, parse it, and query it with XPath.

# Load the packages ---------------------------------
# library() stops with an error when a package is missing, whereas
# require() only returns FALSE and lets the script fail further down.
library(lubridate)
library(plyr)
library(stringr)
library(XML)
library(RCurl)

# Extracting a table --------------------------------
# Read the tables found on the page and preview the sixth one.
url <- "http://en.wikipedia.org/wiki/Elections_in_Russia"
tables <- readHTMLTable(url)
head(tables[[6]])

# Download html -------------------------------------
url <- "http://en.wikipedia.org/wiki/Boris_Nemtsov"
# Download the page.
# NOTE(review): `encoding` is forwarded to libcurl as a request option;
# RCurl's argument for decoding the response text is `.encoding` --
# confirm which one was intended.
raw <- getURL(url, encoding = "UTF-8")
# `raw` is one very, very long string. Let's not print it. Instead, show
# only its first 200 characters:
substring(raw, 1, 200)
# Format the html code: parse the downloaded text into a document.
# asText = TRUE states explicitly that `raw` holds markup, not a file name.
PARSED <- htmlParse(raw, asText = TRUE)

# Accessing HTML elements in R with XPath -----------
# Select every <h1> node in the document.
xpathSApply(PARSED, "//h1")

# Extract content -----------------------------------
# Same query, but apply xmlValue to each match to get its text content.
xpathSApply(PARSED, "//h1", xmlValue)

# Untitled -----------------------------------------
# NOTE(review): the text below looks like slide/markdown content, and the
# tag names in the list appear to be missing -- restore from the original.
#
# HTML tags
# ======================
# - \: starts html code
# - \ : contains meta data etc
# - \