|
JMeter example source code file (RegexpHTMLParser.java)
The JMeter RegexpHTMLParser.java source code/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.jmeter.protocol.http.parser; import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import org.apache.jmeter.protocol.http.util.ConversionUtils; import org.apache.jmeter.util.JMeterUtils; import org.apache.jorphan.logging.LoggingManager; import org.apache.log.Logger; // NOTE: Also looked at using Java 1.4 regexp instead of ORO. The change was // trivial. Performance did not improve -- at least not significantly. // Finally decided for ORO following advise from Stefan Bodewig (message // to jmeter-dev dated 25 Nov 2003 8:52 CET) [Jordi] import org.apache.oro.text.regex.MatchResult; import org.apache.oro.text.regex.Pattern; import org.apache.oro.text.regex.PatternMatcherInput; import org.apache.oro.text.regex.Perl5Compiler; import org.apache.oro.text.regex.Perl5Matcher; /** * HtmlParser implementation using regular expressions. * <p> * This class will find RLs specified in the following ways (where <b>url * represents the RL being found: * <ul> * <li><img src=url ... > * <li><script src=url ... > * <li><applet code=url ... > * <li><input type=image src=url ... > * <li><body background=url ... > * <li><table background=url ... > * <li><td background=url ... > * <li><tr background=url ... > * <li><applet ... codebase=url ... > * <li><embed src=url ... > * <li><embed codebase=url ... > * <li><object codebase=url ... > * <li><link rel=stylesheet href=url... gt; * <li><bgsound src=url ... > * <li><frame src=url ... > * </ul> * * <p> * This class will take into account the following construct: * <ul> * <li><base href=url> * </ul> * * <p> * But not the following: * <ul> * <li>< ... codebase=url ... > * </ul> * */ class RegexpHTMLParser extends HTMLParser { private static final Logger log = LoggingManager.getLoggerForClass(); /** * Regexp fragment matching a tag attribute's value (including the equals * sign and any spaces before it). Note it matches unquoted values, which to * my understanding, are not conformant to any of the HTML specifications, * but are still quite common in the web and all browsers seem to understand * them. */ private static final String VALUE = "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'\\s>\\\\][^\\s>]*)(?=[\\s>]))"; // Note there's 3 capturing groups per value /** * Regexp fragment matching the separation between two tag attributes. */ private static final String SEP = "\\s(?:[^>]*\\s)?"; /** * Regular expression used against the HTML code to find the URIs of images, * etc.: */ private static final String REGEXP = "<(?:" + "!--.*?-->" + "|BASE" + SEP + "HREF" + VALUE + "|(?:IMG|SCRIPT|FRAME|IFRAME|BGSOUND|FRAME)" + SEP + "SRC" + VALUE + "|APPLET" + SEP + "CODE(?:BASE)?" + VALUE + "|(?:EMBED|OBJECT)" + SEP + "(?:SRC|CODEBASE)" + VALUE + "|(?:BODY|TABLE|TR|TD)" + SEP + "BACKGROUND" + VALUE + "|[^<]+?STYLE\\s*=['\"].*?URL\\(\\s*['\"](.+?)['\"]\\s*\\)" + "|INPUT(?:" + SEP + "(?:SRC" + VALUE + "|TYPE\\s*=\\s*(?:\"image\"|'image'|image(?=[\\s>])))){2,}" + "|LINK(?:" + SEP + "(?:HREF" + VALUE + "|REL\\s*=\\s*(?:\"stylesheet\"|'stylesheet'|stylesheet(?=[\\s>])))){2,}" + ")"; // Number of capturing groups possibly containing Base HREFs: private static final int NUM_BASE_GROUPS = 3; /** * Thread-local input: */ private static final ThreadLocal<PatternMatcherInput> localInput = new ThreadLocal<PatternMatcherInput>() { @Override protected PatternMatcherInput initialValue() { return new PatternMatcherInput(new char[0]); } }; /** * {@inheritDoc} */ @Override protected boolean isReusable() { return true; } /** * Make sure to compile the regular expression upon instantiation: */ protected RegexpHTMLParser() { super(); } /** * {@inheritDoc} */ @Override public Iterator<URL> getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls) { Perl5Matcher matcher = JMeterUtils.getMatcher(); PatternMatcherInput input = localInput.get(); // TODO: find a way to avoid the cost of creating a String here -- // probably a new PatternMatcherInput working on a byte[] would do // better. input.setInput(new String(html)); // TODO - charset? Pattern pattern=JMeterUtils.getPatternCache().getPattern( REGEXP, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.SINGLELINE_MASK | Perl5Compiler.READ_ONLY_MASK); while (matcher.contains(input, pattern)) { MatchResult match = matcher.getMatch(); String s; if (log.isDebugEnabled()) { log.debug("match groups " + match.groups() + " " + match.toString()); } // Check for a BASE HREF: for (int g = 1; g <= NUM_BASE_GROUPS && g <= match.groups(); g++) { s = match.group(g); if (s != null) { if (log.isDebugEnabled()) { log.debug("new baseUrl: " + s + " - " + baseUrl.toString()); } try { baseUrl = ConversionUtils.makeRelativeURL(baseUrl, s); } catch (MalformedURLException e) { // Doesn't even look like a URL? // Maybe it isn't: Ignore the exception. if (log.isDebugEnabled()) { log.debug("Can't build base URL from RL " + s + " in page " + baseUrl, e); } } } } for (int g = NUM_BASE_GROUPS + 1; g <= match.groups(); g++) { s = match.group(g); if (s != null) { if (log.isDebugEnabled()) { log.debug("group " + g + " - " + match.group(g)); } urls.addURL(s, baseUrl); } } } return urls.iterator(); } } Other JMeter examples (source code examples)Here is a short list of links related to this JMeter RegexpHTMLParser.java source code file: |
... this post is sponsored by my books ... | |
![]() #1 New Release! |
![]() FP Best Seller |
Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.