Extract A tag Regular Expression Pattern
(?i)<a([^>]+)>(.+?)</a>
Description
(?i)        # inline flag: all matching is case-insensitive (this is not a capturing group)
<a          # start with "<a"
(           # start of group #1
  [^>]+     #   anything except ">", at least one character
)           # end of group #1
>           # followed by ">"
(.+?)       # group #2: match anything, as few characters as possible
</a>        # end with "</a>"
Copy
\s*             # can start with whitespace
(?i)            # inline flag: all matching is case-insensitive
href            # followed by the word "href"
\s*=\s*         # allows spaces on either side of the equals sign
(               # start of group #1
  "([^"]*")     #   a string enclosed in double quotes - "string"
  |             #   ..or
  '[^']*'       #   a string enclosed in single quotes - 'string'
  |             #   ..or
  ([^'">\s]+)   #   an unquoted value: cannot contain a single quote, a double quote, ">" or whitespace
)               # end of group #1
Code:
package com.codewr.javacore.regular.expression;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts hyperlinks from HTML text with regular expressions: for every
 * anchor element it reports the {@code href} value together with the text
 * enclosed by the element.
 *
 * <p>This is a lightweight regex scan, not an HTML parser; nested or
 * malformed markup is not handled. Instances hold no mutable state.
 *
 * @author codewr
 */
public class HTMLLinkExtractor {

    /**
     * Matches one anchor element. Group 1 is the raw attribute text of the
     * opening tag, group 2 is everything up to the nearest {@code </a>}.
     * The whitespace required after {@code <a} keeps other tags that merely
     * start with "a" (abbr, address, article, ...) from being treated as
     * anchors, and {@code .*?} allows an empty link text so that an empty
     * anchor cannot swallow the anchor that follows it.
     */
    private static final String HTML_A_TAG_PATTERN = "<a\\s([^>]*)>(.*?)</a>";

    /**
     * Matches an {@code href} attribute inside the attribute text. Group 1
     * is the value, still including its surrounding quotes when quoted.
     * The look-behind rejects names that only end in "href", such as
     * {@code data-href}.
     */
    private static final String HTML_A_HREF_TAG_PATTERN
            = "(?<![\\w-])href\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^'\">\\s]+)";

    // Compiled once and shared: Pattern objects are immutable.
    // DOTALL lets the link text span several lines.
    private static final Pattern TAG_PATTERN = Pattern.compile(
            HTML_A_TAG_PATTERN, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern HREF_PATTERN = Pattern.compile(
            HTML_A_HREF_TAG_PATTERN, Pattern.CASE_INSENSITIVE);

    public HTMLLinkExtractor() {
    }

    /**
     * Collects every link found in the given HTML.
     *
     * @param html HTML content to scan; {@code null} yields an empty result
     * @return one {@code HtmlLink} per {@code href} attribute found inside an
     *         anchor tag, in document order; never {@code null}
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {
        // Vector is kept as the return type so existing callers keep compiling.
        Vector<HtmlLink> result = new Vector<HtmlLink>();
        if (html == null) {
            return result;
        }
        // Matchers are local so that no state is shared between calls.
        Matcher tagMatcher = TAG_PATTERN.matcher(html);
        while (tagMatcher.find()) {
            String attributes = tagMatcher.group(1);
            String linkText = tagMatcher.group(2);
            Matcher hrefMatcher = HREF_PATTERN.matcher(attributes);
            while (hrefMatcher.find()) {
                HtmlLink link = new HtmlLink();
                // The captured value may still carry its quotes; it is handed
                // to setLink as-is.
                link.setLink(hrefMatcher.group(1));
                link.setLinkText(linkText);
                result.add(link);
            }
        }
        return result;
    }

    /**
     * Small demonstration: prints the link and the link text of a sample anchor.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<a href='http://www.google.com'>google</a>";
        HTMLLinkExtractor extractor = new HTMLLinkExtractor();
        for (HtmlLink link : extractor.grabHTMLLinks(html)) {
            System.out.println(link.getLink());
            System.out.println(link.getLinkText());
        }
    }
}
Class HtmlLink:
package com.codewr.javacore.regular.expression;
/**
 * Holder for one extracted hyperlink: the link target and the text that was
 * enclosed by the anchor element.
 *
 * @author codewr
 */
public class HtmlLink {

    String link;
    String linkText;

    HtmlLink() {
    }

    @Override
    public String toString() {
        return "Link : " + this.link + " Link Text : " + this.linkText;
    }

    /**
     * @return the link target without enclosing quotes, or {@code null} if
     *         none has been set
     */
    public String getLink() {
        return link;
    }

    /**
     * Stores the link target. A single pair of matching quotes around the
     * value ({@code "..."} or {@code '...'}) is removed; quote characters
     * inside the value are kept, so {@code "/o'brien"} becomes
     * {@code /o'brien}.
     *
     * @param link raw attribute value, quoted or unquoted; may be {@code null}
     */
    public void setLink(String link) {
        this.link = stripEnclosingQuotes(link);
    }

    /**
     * @return the text enclosed by the anchor element, or {@code null} if
     *         none has been set
     */
    public String getLinkText() {
        return linkText;
    }

    /**
     * @param linkText the text enclosed by the anchor element
     */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /** Removes one pair of matching single or double quotes wrapping the value, if present. */
    private static String stripEnclosingQuotes(String value) {
        if (value == null || value.length() < 2) {
            return value;
        }
        char first = value.charAt(0);
        char last = value.charAt(value.length() - 1);
        boolean quoted = (first == '"' || first == '\'') && first == last;
        return quoted ? value.substring(1, value.length() - 1) : value;
    }
}
Code UnitTest:
package com.codewr.javacore.regular.expression;
import com.codewr.javacore.regular.expression.HTMLLinkExtractor.HtmlLink;
import java.util.Vector;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
 * TestNG tests for the HTML link extractor.
 *
 * <p>Every data-provider row is an HTML snippet containing one or more anchor
 * tags that point at {@code TEST_LINK}; the test checks that at least one
 * link is extracted and that every extracted link equals {@code TEST_LINK}.
 *
 * @author codewr
 *
 */
public class TestHTMLLinkExtractor {

    private HTMLLinkExtractor htmlLinkExtractor;

    String TEST_LINK = "http://www.google.com";

    /** Creates the extractor shared by all test invocations. */
    @BeforeClass
    public void initData() {
        htmlLinkExtractor = new HTMLLinkExtractor();
    }

    /**
     * Supplies the HTML snippets: lower- and upper-case tag and attribute
     * names, single-quoted, double-quoted and unquoted values, extra
     * attributes before or after the href, and two anchors in one string.
     */
    @DataProvider
    public Object[][] HTMLContentProvider() {
        final String twoAnchors = " <A HREF='" + TEST_LINK + "'>google</A> , "
                + " <A HREF='" + TEST_LINK + "' target='_blank'>google</A>";
        return new Object[][] {
            { " <a href='" + TEST_LINK + "'>google</a>" },
            { " <a HREF='" + TEST_LINK + "'>google</a>" },
            { twoAnchors },
            { "<A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },
            { "<A target='_blank' HREF='" + TEST_LINK + "'>google</A>" },
            { "<A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>" },
            { "<a HREF=" + TEST_LINK + ">google</a>" },
        };
    }

    /**
     * Verifies that the snippet yields at least one link and that each one
     * equals {@code TEST_LINK}; every extracted link is also printed.
     *
     * @param html the HTML snippet under test
     */
    @Test(dataProvider = "HTMLContentProvider")
    public void ValidHTMLLinkTest(String html) {
        final Vector<HtmlLink> extracted = htmlLinkExtractor.grabHTMLLinks(html);
        // an empty result means the extractor missed the anchor entirely
        Assert.assertTrue(!extracted.isEmpty());
        for (HtmlLink extractedLink : extracted) {
            System.out.println(extractedLink);
            Assert.assertEquals(extractedLink.getLink(), TEST_LINK);
        }
    }
}
Output:
Running TestHTMLLinkExtractor
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Link : http://www.google.com Link Text : google
Tests run: 7, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.483 sec
Results :
Tests run: 7, Failures: 0, Errors: 0, Skipped: 0
Note: add the TestNG dependency to your Maven pom.xml file:
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>