Extract data from common web formats such as HTML, XML and JSON using jQuery-style selectors, XPath or JsonPath.
Implements:
### Usage

To add a dependency on Web-Data-Extractor using Maven, use the following:
<dependency>
<groupId>im.nll.data</groupId>
<artifactId>extractor</artifactId>
<version>0.9.3</version>
</dependency>
To add a dependency using Gradle:
dependencies {
compile 'im.nll.data:extractor:0.9.3'
}
## Examples
### Extract a single value
// Extract with the "div.followers" selector, then apply the digit regex to that result and return it as a String.
String followers = Extractors.on(baseHtml).extract(new SelectorExtractor("div.followers")).with(new RegexExtractor("\\d+")).asString();
Or use the static methods:
// Same extraction, written with the selector(...) and regex(...) static methods instead of constructors.
String followers = Extractors.on(baseHtml).extract(selector("div.followers")).with(regex("\\d+")).asString();
Chaining more methods:
// Chain several steps: filter(...) transforms the current value, with(...) applies a further extractor to it.
String year = Extractors.on("<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>")
.extract(selector("div")) // extract with selector
.filter(value -> value.trim()) // trim result
.with(regex("20\\d{2}")) // get year with regex
.filter(value -> "from " + value) // prepend the "from " prefix
.asString();
// The final value is the regex match with the prefix added by the last filter.
Assert.assertEquals("from 2000", year);
### Extract data to a map
/**
 * Extracts three named fields from {@code baseHtml} in one chain and reads
 * them back from the resulting map using the names passed to {@code extract}.
 */
@Test
public void testToMap() throws Exception {
Map<String, String> dataMap = Extractors.on(baseHtml)
.extract("title", selector("a.title"))
// NOTE(review): with(...) presumably refines only the preceding "followers" extraction - confirm
.extract("followers", selector("div.followers")).with(regex("\\d+"))
.extract("description", selector("div.description"))
.asMap();
// Map keys are the field names given to extract(...) above.
Assert.assertEquals("fivesmallq", dataMap.get("title"));
Assert.assertEquals("29671", dataMap.get("followers"));
Assert.assertEquals("Talk is cheap. Show me the code.", dataMap.get("description"));
}
### Extract data to a list of maps
/**
 * Splits {@code listHtml} with the {@code tr.item.html} selector and collects
 * the "type", "name" and "url" extractions as a list of maps.
 */
@Test
public void testToMapList() throws Exception {
    // The split parameter must implement ListableExtractor.
    List<Map<String, String>> languages = Extractors.on(listHtml).split(selector("tr.item.html"))
            .extract("type", selector("td.type"))
            .extract("name", selector("td.name"))
            .extract("url", selector("td.url"))
            .asMapList();
    Assert.assertNotNull(languages);
    // Check the size before indexing so a short list fails as an assertion,
    // not as an IndexOutOfBoundsException.
    Assert.assertEquals(3, languages.size());
    Map<String, String> second = languages.get(1);
    // Expected value first, then the actual value.
    Assert.assertEquals("dynamic", second.get("type"));
    Assert.assertEquals("Ruby", second.get("name"));
    Assert.assertEquals("https://www.ruby-lang.org", second.get("url"));
}
### Extract data to a bean
/**
 * Extracts three named fields from {@code baseHtml} and maps them onto a
 * {@code Base} instance, then reads them back through its getters.
 */
@Test
public void testToBean() throws Exception {
Base base = Extractors.on(baseHtml)
.extract("title", selector("a.title"))
// NOTE(review): with(...) presumably refines only the preceding "followers" extraction - confirm
.extract("followers", selector("div.followers")).with(regex("\\d+"))
.extract("description", selector("div.description"))
.asBean(Base.class);
// NOTE(review): field names passed to extract(...) presumably match the bean's property names - confirm
Assert.assertEquals("fivesmallq", base.getTitle());
Assert.assertEquals("29671", base.getFollowers());
Assert.assertEquals("Talk is cheap. Show me the code.", base.getDescription());
}
### Extract data to a list of beans
/**
 * Splits {@code listHtml} with the {@code tr.item.html} selector and collects
 * the "type", "name" and "url" extractions as a list of {@code Language} beans.
 */
@Test
public void testToBeanList() throws Exception {
    List<Language> languages = Extractors.on(listHtml).split(selector("tr.item.html"))
            .extract("type", selector("td.type"))
            .extract("name", selector("td.name"))
            .extract("url", selector("td.url"))
            .asBeanList(Language.class);
    Assert.assertNotNull(languages);
    // Check the size before indexing so a short list fails as an assertion,
    // not as an IndexOutOfBoundsException.
    Assert.assertEquals(3, languages.size());
    Language second = languages.get(1);
    // Expected value first, then the actual value.
    Assert.assertEquals("dynamic", second.getType());
    Assert.assertEquals("Ruby", second.getName());
    Assert.assertEquals("https://www.ruby-lang.org", second.getUrl());
}
See Example for more.