8000 GitHub - mengjingji/web-data-extractor: web-data-extractor:Extract data with Jquery Selector, XPath or JsonPath from common web format. like HTML, XML, JSON.
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

web-data-extractor:Extract data with Jquery Selector, XPath or JsonPath from common web format. like HTML, XML, JSON.

License

Notifications You must be signed in to change notification settings

mengjingji/web-data-extractor

 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

81 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

web-data-extractor

Maven Central Build Status codecov.io License

Extracting data with Jquery Selector, XPath or JsonPath from common web format like HTML, XML and JSON.

Implements:

###Usage To add a dependency on Web-Data-Extractor using Maven, use the following:

<dependency>
    <groupId>im.nll.data</groupId>
    <artifactId>extractor</artifactId>
    <version>0.9.3</version>
</dependency>

To add a dependency using Gradle:

dependencies {
  compile 'im.nll.data:extractor:0.9.3'
}

##Examples

###extract single data

    String followers = Extractors.on(baseHtml).extract(new SelectorExtractor("div.followers")).with(new RegexExtractor("\\d+")).asString();

or use static method

    String followers = Extractors.on(baseHtml).extract(selector("div.followers")).with(regex("\\d+")).asString();

more method

 String year = Extractors.on("<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>")
                .extract(selector("div")) // extract with selector
                .filter(value -> value.trim()) // trim result
                .with(regex("20\\d{2}")) // get year with regex
                .filter(value -> "from " + value) // append 'form' string
                .asString();
        Assert.assertEquals("from 2000", year);

###extract data to map

    @Test
    public void testToMap() throws Exception {
        Map<String, String> dataMap = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asMap();
        Assert.assertEquals("fivesmallq", dataMap.get("title"));
        Assert.assertEquals("29671", dataMap.get("followers"));
        Assert.assertEquals("Talk is cheap. Show me the code.", dataMap.get("description"));
    }

###extract data to map list

    @Test
    public void testToMapList() throws Exception {
        //split param must implements ListableExtractor
        List<Map<String, String>> languages = Extractors.on(listHtml).split(selector("tr.item.html"))
                .extract("type", selector("td.type"))
                .extract("name", selector("td.name"))
                .extract("url", selector("td.url"))
                .asMapList();
        Assert.assertNotNull(languages);
        Map<String, String> second = languages.get(1);
        Assert.assertEquals(languages.size(), 3);
        Assert.assertEquals(second.get("type"), "dynamic");
        Assert.assertEquals(second.get("name"), "Ruby");
        Assert.assertEquals(second.get("url"), "https://www.ruby-lang.org");
    }

###extract data to bean

    @Test
    public void testToBean() throws Exception {
        Base base = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asBean(Base.class);
        Assert.assertEquals("fivesmallq", base.getTitle());
        Assert.assertEquals("29671", base.getFollowers());
        Assert.assertEquals("Talk is cheap. Show me the code.", base.getDescription());
    }

###extract data to bean list

    @Test
    public void testToBeanList() throws Exception {
        List<Language> languages = Extractors.on(listHtml).split(selector("tr.item.html"))
                .extract("type", selector("td.type"))
                .extract("name", selector("td.name"))
                .extract("url", selector("td.url"))
                .asBeanList(Language.class);
        Assert.assertNotNull(languages);
        Language second = languages.get(1);
        Assert.assertEquals(languages.size(), 3);
        Assert.assertEquals(second.getType(), "dynamic");
        Assert.assertEquals(second.getName(), "Ruby");
        Assert.assertEquals(second.getUrl(), "https://www.ruby-lang.org");
    }

see Example

About

web-data-extractor:Extract data with Jquery Selector, XPath or JsonPath from common web format. like HTML, XML, JSON.

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages

  • Java 99.9%
  • HTML 0.1%
0