import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and extracts the text of its {@code <title>} element.
 */
public class WebContent {

    /**
     * Matches one complete {@code <title>...</title>} element. Case-insensitive
     * because HTML tag names are, and DOTALL so a title that spans several
     * lines is still found.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches anything that looks like a markup tag on a single line. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole content of a web page, decoded as UTF-8.
     *
     * <p>The lines of the page are concatenated without their line terminators.
     *
     * @param htmlurl address of the page to read
     * @return the page content as a single string
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final StringBuilder content = new StringBuilder();
        try {
            final URL url = new URL(htmlurl);
            // try-with-resources closes the stream even when reading fails.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
                String line;
                // readLine() returns null at end of stream; blank lines inside
                // the page must not stop the loop.
                while ((line = in.readLine()) != null) {
                    content.append(line);
                }
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        } catch (final IOException e) {
            e.printStackTrace();
            throw e;
        }
        return content.toString();
    }

    /**
     * Extracts the page title from HTML text.
     *
     * <p>If the text contains several title elements, their contents are
     * concatenated in document order.
     *
     * @param s HTML text; may be {@code null}
     * @return the title with all tags removed, or an empty string when
     *         {@code s} is {@code null} or contains no title element
     */
    public String getTitle(final String s) {
        if (s == null) {
            return "";
        }
        final StringBuilder title = new StringBuilder();
        final Matcher ma = TITLE_PATTERN.matcher(s);
        while (ma.find()) {
            title.append(ma.group());
        }
        return outTag(title.toString());
    }

    /**
     * Removes markup tags from a string.
     *
     * @param s text that may contain tags; may be {@code null}
     * @return {@code s} with every {@code <...>} sequence removed, or an empty
     *         string when {@code s} is {@code null}
     */
    public String outTag(final String s) {
        if (s == null) {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a page and returns its title.
     *
     * <p>This is a best-effort operation: when the page cannot be read, the
     * failure is reported on standard error and the returned title is empty.
     *
     * @param s address of the page
     * @return a map holding the page title under the key {@code "title"}
     */
    public HashMap<String, String> getTitleFromUrl(final String s) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";
        System.out.println("\n------------------开始读取网页(" + s + ")--------------------");
        try {
            html = getOneHtml(s);
        } catch (final Exception e) {
            // Keep going with an empty page, but do not hide the failure.
            System.err.println("Failed to read " + s + ": " + e);
        }
        System.out.println("------------------读取网页(" + s + ")结束--------------------\n");
        System.out.println("------------------分析(" + s + ")结果如下--------------------\n");
        // getTitle already strips the tags, so no second outTag pass is needed.
        hm.put("title", getTitle(html));
        return hm;
    }

    /**
     * Manual test: prints the title of a sample page.
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        final String url = "http://www.php2.cc/article-2682-1.html";
        final WebContent wc = new WebContent();
        final HashMap<String, String> hm = wc.getTitleFromUrl(url);
        System.out.println("标题: " + hm.get("title"));
    }
}
// PHP technical discussion QQ group: 422137578. Unless otherwise noted, articles are original works of "PHP二次开发"; when reposting, please cite this article's address: http://www.php2.cc/article-2683-1.html