Extract HTML Tags with Regular Expression in Java


Create a new HTML file named index.html as below:

<!DOCTYPE html>
<!-- Sample page used as the input file for the Java tag-extraction example. -->
<html>
<head>
<!-- NOTE(review): Main.java decodes this file as UTF-8 while ISO-8859-1 is declared here.
     The visible content is ASCII-only, so both decode identically; confirm which encoding is intended. -->
<meta charset="ISO-8859-1">
<title>Extract HTML Tags with Regular Expression</title>

<!-- Stylesheet references: two link elements. -->
<link href="css/a.css" rel="stylesheet" type="text/css">
<link href="css/b.css" rel="stylesheet" type="text/css">

<!-- External scripts: three script elements with empty bodies. -->
<script type="text/javascript" src="js/c.js"></script>
<script type="text/javascript" src="js/d.js"></script>
<script type="text/javascript" src="js/e.js"></script>

</head>
<body>

	<!-- Body content: two paragraphs, two images and two hyperlinks. -->
	<p>Paragraph 1</p>
	<p>Paragraph 2</p>
	<img src="images/a.gif" width="120" height="100">
	<img src="images/b.gif" width="120" height="100">
	<a href="abc.com">Link 1</a>
	<a href="def.net">Link 2</a>

</body>
</html>




Create a new Java file named Main.java. This file uses regular expressions to extract HTML tags, as shown below:

package regular_expression;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates extracting common HTML elements (title, stylesheet links, external scripts,
 * hyperlinks and images) from a page with regular expressions.
 *
 * <p>Every pattern confines attribute matching to a single tag with {@code [^>]*}, so two tags
 * that share a line are reported separately and a tag may span several lines. Tag names are
 * matched case-insensitively; attribute values must be double-quoted.
 *
 * <p>Regular expressions are adequate for simple, well-formed markup such as the sample page;
 * they are not a general HTML parser.
 */
public class Main {

	/** Matches a title element; group 1 is the text between the opening and closing tag. */
	private static final Pattern TITLE = Pattern.compile(
			"<title\\b[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Matches a link tag carrying {@code rel="stylesheet"} at any attribute position. */
	private static final Pattern STYLESHEET = Pattern.compile(
			"<link\\b[^>]*\\srel=\"stylesheet\"[^>]*>", Pattern.CASE_INSENSITIVE);

	/** Matches a bodiless script element carrying {@code type="text/javascript"}. */
	private static final Pattern SCRIPT = Pattern.compile(
			"<script\\b[^>]*\\stype=\"text/javascript\"[^>]*>\\s*</script>", Pattern.CASE_INSENSITIVE);

	/** Matches an anchor element with an href attribute, up to the nearest closing tag. */
	private static final Pattern LINK = Pattern.compile(
			"<a\\b[^>]*\\shref=\"[^\"]*\"[^>]*>.*?</a>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Matches an img tag with a src attribute at any position, including self-closed tags. */
	private static final Pattern IMAGE = Pattern.compile(
			"<img\\b[^>]*\\ssrc=\"[^\"]*\"[^>]*>", Pattern.CASE_INSENSITIVE);

	/**
	 * Prints every occurrence of {@code pattern} in {@code html}, each followed by a separator line.
	 *
	 * @param pattern   compiled expression to search for
	 * @param html      markup to scan
	 * @param separator line printed after each match
	 */
	private static void printMatches(Pattern pattern, String html, String separator) {
		Matcher matcher = pattern.matcher(html);
		while (matcher.find()) {
			System.out.println(matcher.group());
			System.out.println(separator);
		}
	}

	/**
	 * Prints the text of each title element found in the page.
	 *
	 * @param html markup to scan
	 */
	private static void extractWebPageTitle(String html) {
		Matcher matcher = TITLE.matcher(html);
		while (matcher.find()) {
			// trim() drops the line breaks and indentation of a title written across several lines.
			System.out.println(matcher.group(1).trim());
		}
	}

	/**
	 * Prints each stylesheet link tag found in the page.
	 *
	 * @param html markup to scan
	 */
	private static void extractCss(String html) {
		printMatches(STYLESHEET, html, "========================================");
	}

	/**
	 * Prints each external JavaScript script element found in the page.
	 *
	 * @param html markup to scan
	 */
	private static void extractJavascripts(String html) {
		printMatches(SCRIPT, html, "========================================");
	}

	/**
	 * Prints each hyperlink (anchor element with an href) found in the page.
	 *
	 * @param html markup to scan
	 */
	private static void extractLinks(String html) {
		printMatches(LINK, html, "==========================");
	}

	/**
	 * Prints each image tag found in the page.
	 *
	 * @param html markup to scan
	 */
	private static void extractImages(String html) {
		printMatches(IMAGE, html, "=========================================");
	}

	/**
	 * Reads an HTML file and prints the extracted elements section by section.
	 *
	 * @param args optional; {@code args[0]} is the path of the HTML file to scan. Without it the
	 *             sample page {@code src/regular_expression/index.html} is read, resolved against
	 *             the working directory.
	 */
	public static void main(String[] args) {

		try {
			// Building the default path from segments keeps it valid on every platform;
			// a literal backslash is an ordinary file-name character outside Windows.
			byte[] content = Files.readAllBytes(
					args != null && args.length > 0
							? Paths.get(args[0])
							: Paths.get("src", "regular_expression", "index.html"));
			String html = new String(content, StandardCharsets.UTF_8);

			System.out.println("Extract Web Page Title");
			extractWebPageTitle(html);

			System.out.println("\nExtract CSS Links");
			extractCss(html);

			System.out.println("\nExtract JavaScript Links");
			extractJavascripts(html);

			System.out.println("\nExtract HTML Links");
			extractLinks(html);

			System.out.println("\nExtract Images");
			extractImages(html);
		} catch (Exception e) {
			// Printing the exception itself keeps its type; for a missing file getMessage() is only the path.
			System.err.println("Could not extract HTML tags: " + e);
		}

	}

}




Extract Web Page Title
Extract HTML Tags with Regular Expression

Extract CSS Links
<link href="css/a.css" rel="stylesheet" type="text/css">
========================================
<link href="css/b.css" rel="stylesheet" type="text/css">
========================================

Extract JavaScript Links
<script type="text/javascript" src="js/c.js"></script>
========================================
<script type="text/javascript" src="js/d.js"></script>
========================================
<script type="text/javascript" src="js/e.js"></script>
========================================

Extract HTML Links
<a href="abc.com">Link 1</a>
==========================
<a href="def.net">Link 2</a>
==========================

Extract Images
<img src="images/a.gif" width="120" height="100">
=========================================
<img src="images/b.gif" width="120" height="100">
=========================================