How to Extract HTML Links with Regular Expressions in Java

Extract A tag Regular Expression Pattern

(?i)<a([^>]+)>(.+?)</a>

Description

(?i)             # inline flag (not a capturing group): all matching is case-insensitive
<a               # start with "<a"
(                #   start of group #1 (the tag's attributes)
  [^>]+          #     anything except ">", at least one character
)                #   end of group #1
>                # followed by ">"
(.+?)            # group #2 (the link text): match anything, as few characters as possible
</a>             # end with "</a>"
Extract href Value Regular Expression Pattern - Description

\s*              # can start with whitespace
(?i)             # all matching is case-insensitive
href             # followed by the word "href"
\s*=\s*          # allows whitespace on either side of the equals sign
(                # start of group #1
  "([^"]*")      #   a string enclosed in double quotes - "string"
  |              #   ..or
  '[^']*'        #   a string enclosed in single quotes - 'string'
  |              #   ..or
  ([^'">\s]+)    #   an unquoted string: no single quote, double quote, ">" or whitespace
)                # end of group #1

Code:

package com.codewr.javacore.regular.expression;

 

import java.util.Vector;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**

 *

 * @author codewr

 */

public class HTMLLinkExtractor {

 

    private Pattern patternTag, patternLink;

    private Matcher matcherTag, matcherLink;

 

    private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";

    private static final String HTML_A_HREF_TAG_PATTERN

            = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

 

    public HTMLLinkExtractor() {

        patternTag = Pattern.compile(HTML_A_TAG_PATTERN);

        patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

    }

 

    /**

     * Validate html with regular expression

     *

     * @param html html content for validation

     * @return Vector links and link text

     */

    public Vector<HtmlLink> grabHTMLLinks(final String html) {

 

        Vector<HtmlLink> result = new Vector<HtmlLink>();

 

        matcherTag = patternTag.matcher(html);

 

        while (matcherTag.find()) {

 

            String href = matcherTag.group(1); // href

            String linkText = matcherTag.group(2); // link text

 

            matcherLink = patternLink.matcher(href);

 

            while (matcherLink.find()) {

 

                String link = matcherLink.group(1); // link

                HtmlLink obj = new HtmlLink();

                obj.setLink(link);

                obj.setLinkText(linkText);

 

                result.add(obj);

 

            }

 

        }

 

        return result;

 

    }

 

    public static void main(String[] args) {

        String html = "<a href='http://www.google.com'>google</a>";

        HTMLLinkExtractor hTMLLinkExtractor = new HTMLLinkExtractor();

        Vector<HtmlLink> htmlLinks = hTMLLinkExtractor.grabHTMLLinks(html);

        for (int i = 0; i < htmlLinks.size(); i++) {

            HtmlLink link = htmlLinks.get(i);

            System.out.println(link.getLink());

            System.out.println(link.getLinkText());

        }

    }

}

Class HtmlLink:

package com.codewr.javacore.regular.expression;

 

/**
 * A single hyperlink: its target ({@code link}) and its visible text
 * ({@code linkText}).
 *
 * @author codewr
 */
public class HtmlLink {

    String link;

    String linkText;

    HtmlLink() {
    }

    /**
     * Returns a readable one-line form, e.g.
     * {@code Link : http://www.google.com Link Text : google}.
     */
    @Override
    public String toString() {
        return "Link : " + this.link + " Link Text : " + this.linkText;
    }

    /**
     * @return the link target as stored by {@link #setLink(String)}, or
     *         {@code null} if none was set
     */
    public String getLink() {
        return link;
    }

    /**
     * Stores the link target. If the value is wrapped in a matching pair of
     * single or double quotes (as a raw {@code href} attribute value is), the
     * quotes are removed; quote characters inside the value are preserved.
     *
     * @param link the raw link value; may be {@code null}
     */
    public void setLink(String link) {
        this.link = stripEnclosingQuotes(link);
    }

    /**
     * @return the link text, or {@code null} if none was set
     */
    public String getLinkText() {
        return linkText;
    }

    /**
     * @param linkText the text shown for the link; stored unchanged
     */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /**
     * Removes one pair of matching quotes surrounding the value, if present.
     * Only the enclosing pair is removed so that URLs which legitimately
     * contain an apostrophe are not altered.
     */
    private static String stripEnclosingQuotes(String value) {
        if (value == null || value.length() < 2) {
            return value;
        }
        char first = value.charAt(0);
        char last = value.charAt(value.length() - 1);
        boolean quoted = (first == '"' || first == '\'') && first == last;
        return quoted ? value.substring(1, value.length() - 1) : value;
    }

}

Code UnitTest:

package com.codewr.javacore.regular.expression;

 

import com.codewr.javacore.regular.expression.HTMLLinkExtractor.HtmlLink;

import java.util.Vector;

 

import org.testng.Assert;

import org.testng.annotations.BeforeClass;

import org.testng.annotations.DataProvider;

import org.testng.annotations.Test;

 

 

/**

 * HTML link extrator Testing

 *

 * @author codewr

 *

 */

public class TestHTMLLinkExtractor {

 

       private HTMLLinkExtractor htmlLinkExtractor;

       String TEST_LINK = "http://www.google.com";

      

       @BeforeClass

       public void initData() {

              htmlLinkExtractor = new HTMLLinkExtractor();

       }

 

       @DataProvider

       public Object[][] HTMLContentProvider() {

         return new Object[][] {

           new Object[] { " <a href='" + TEST_LINK + "'>google</a>" },

           new Object[] { " <a HREF='" + TEST_LINK + "'>google</a>" },

                          

           new Object[] { " <A HREF='" + TEST_LINK + "'>google</A> , "

              + " <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },

                                        

           new Object[] {"<A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },

           new Object[] {"<A target='_blank' HREF='" + TEST_LINK + "'>google</A>" },

           new Object[]{"<A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>" },

           new Object[] { "<a HREF=" + TEST_LINK + ">google</a>" }, };

       }

 

       @Test(dataProvider = "HTMLContentProvider")

       public void ValidHTMLLinkTest(String html) {

 

              Vector<HtmlLink> links = htmlLinkExtractor.grabHTMLLinks(html);

 

              //there must have something

              Assert.assertTrue(links.size() != 0);

 

              for (int i = 0; i < links.size(); i++) {

                     HtmlLink htmlLinks = links.get(i);

                     System.out.println(htmlLinks);

                     Assert.assertEquals(htmlLinks.getLink(), TEST_LINK);

              }

 

       }

}

Output:

Running TestHTMLLinkExtractor

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Link : http://www.google.com Link Text : google

Tests run: 7, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.483 sec

 

Results :

 

Tests run: 7, Failures: 0, Errors: 0, Skipped: 0

Note: add the TestNG dependency to your Maven pom.xml file:

<dependency>

<groupId>org.testng</groupId>

<artifactId>testng</artifactId>

<version>6.8</version>

<scope>test</scope>

</dependency>