HTML File
Create a new HTML file named index.html with the following content:
<!DOCTYPE html>
<html>
<head>
<!-- Declared as UTF-8 so the page's encoding agrees with how Main.java decodes the file. -->
<meta charset="UTF-8">
<title>Extract HTML Tags with Regular Expression</title>
<link href="css/a.css" rel="stylesheet" type="text/css">
<link href="css/b.css" rel="stylesheet" type="text/css">
<script type="text/javascript" src="js/c.js"></script>
<script type="text/javascript" src="js/d.js"></script>
<script type="text/javascript" src="js/e.js"></script>
</head>
<body>
<p>Paragraph 1</p>
<p>Paragraph 2</p>
<img src="images/a.gif" width="120" height="100">
<img src="images/b.gif" width="120" height="100">
<a href="abc.com">Link 1</a>
<a href="def.net">Link 2</a>
</body>
</html>
Run Application
Create a new Java file named Main.java. This file uses regular expressions to extract HTML tags, as shown below:
package regular_expression;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstrates pulling common HTML elements (page title, stylesheet links,
 * scripts, anchors and images) out of a page with regular expressions and
 * printing each match to standard output.
 *
 * <p>Regular expressions are not a full HTML parser: the patterns below only
 * recognise tags written in the same shape as the sample page (lower-case tag
 * names, double-quoted attribute values, and {@code href}/{@code src} as the
 * first attribute of {@code <a>}/{@code <img>}).
 */
public class Main {

    // Patterns are compiled once. Every tag pattern is bounded by [^>]* or [^"]*
    // (instead of a greedy ".*") so a match can never run past the end of its own
    // tag and swallow a neighbouring tag that happens to sit on the same line.

    /** Text between {@code <title>} and {@code </title>} on a single line (group 1). */
    private static final Pattern TITLE = Pattern.compile("<title>(.*?)</title>");

    /** A {@code <link>} tag carrying {@code rel="stylesheet"}, wherever that attribute appears. */
    private static final Pattern STYLESHEET =
            Pattern.compile("<link\\b[^>]*\\srel=\"stylesheet\"[^>]*>");

    /** An empty {@code <script>} element carrying {@code type="text/javascript"}. */
    private static final Pattern SCRIPT =
            Pattern.compile("<script\\b[^>]*\\stype=\"text/javascript\"[^>]*></script>");

    /** An {@code <a href="...">} element with non-empty content on a single line. */
    private static final Pattern ANCHOR = Pattern.compile("<a href=\"([^\"]*)\"[^>]*>.+?</a>");

    /** An {@code <img src="...">} tag, with or without further attributes. */
    private static final Pattern IMAGE = Pattern.compile("<img src=\"([^\"]*)\"[^>]*>");

    /**
     * Prints every complete match of {@code pattern} in {@code html}, each
     * followed by a line containing {@code separator}.
     */
    private static void printMatches(Pattern pattern, String html, String separator) {
        Matcher matcher = pattern.matcher(html);
        while (matcher.find()) {
            System.out.println(matcher.group(0));
            System.out.println(separator);
        }
    }

    /** Prints the text of every {@code <title>} element found in {@code html}. */
    private static void extractWebPageTitle(String html) {
        Matcher matcher = TITLE.matcher(html);
        while (matcher.find()) {
            // Only the captured text is printed, not the surrounding tags.
            System.out.println(matcher.group(1));
        }
    }

    /** Prints every stylesheet {@code <link>} tag found in {@code html}. */
    private static void extractCss(String html) {
        printMatches(STYLESHEET, html, "========================================");
    }

    /** Prints every JavaScript {@code <script>} element found in {@code html}. */
    private static void extractJavascripts(String html) {
        printMatches(SCRIPT, html, "========================================");
    }

    /** Prints every {@code <a href="...">} element found in {@code html}. */
    private static void extractLinks(String html) {
        printMatches(ANCHOR, html, "==========================");
    }

    /** Prints every {@code <img src="...">} tag found in {@code html}. */
    private static void extractImages(String html) {
        printMatches(IMAGE, html, "=========================================");
    }

    /**
     * Reads an HTML file and prints its title, stylesheet links, scripts,
     * anchors and images, each under its own heading.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to
     *             read. When absent, {@code src/regular_expression/index.html}
     *             relative to the working directory is used.
     */
    public static void main(String[] args) {
        // The default path is built from segments so it resolves with the
        // platform's own separator rather than a hard-coded backslash.
        String location = (args != null && args.length > 0)
                ? args[0]
                : Paths.get("src", "regular_expression", "index.html").toString();
        try {
            String html = new String(Files.readAllBytes(Paths.get(location)), StandardCharsets.UTF_8);
            System.out.println("Extract Web Page Title");
            extractWebPageTitle(html);
            System.out.println("\nExtract CSS Links");
            extractCss(html);
            System.out.println("\nExtract JavaScript Links");
            extractJavascripts(html);
            System.out.println("\nExtract HTML Links");
            extractLinks(html);
            System.out.println("\nExtract Images");
            extractImages(html);
        } catch (Exception e) {
            // Top-level boundary: report which file failed and the exception type,
            // since the bare message of a missing-file error is only the path.
            System.err.println("Could not extract tags from " + location + ": " + e);
        }
    }
}
Output
Extract Web Page Title
Extract HTML Tags with Regular Expression
Extract CSS Links
<link href="css/a.css" rel="stylesheet" type="text/css">
========================================
<link href="css/b.css" rel="stylesheet" type="text/css">
========================================
Extract JavaScript Links
<script type="text/javascript" src="js/c.js"></script>
========================================
<script type="text/javascript" src="js/d.js"></script>
========================================
<script type="text/javascript" src="js/e.js"></script>
========================================
Extract HTML Links
<a href="abc.com">Link 1</a>
==========================
<a href="def.net">Link 2</a>
==========================
Extract Images
<img src="images/a.gif" width="120" height="100">
=========================================
<img src="images/b.gif" width="120" height="100">
=========================================