织梦CMS - 轻松建站从此开始!

罗索实验室

当前位置: 主页 > 行业动态 > 『互联网』 >

spider简单的爬虫程序

jackyhwei 发布于 2010-06-04 13:20 点击:次 
spider简单的爬虫程序
TAG:

spider简单的爬虫程序

1、基础准备
htmlparser
首页:http://sourceforge.net/projects/htmlparser/
下载:http://sourceforge.net/project/showfiles.php?group_id=24399
文件:htmlparser1_6_20060610.zip
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>
</dependency>

cpdetector
首页:http://cpdetector.sourceforge.net/
下载:http://sourceforge.net/project/showfiles.php?group_id=114421
文件:cpdetector_eclipse_project_1.0.7.zip

<dependency>
<groupId>cpdetector</groupId>
<artifactId>cpdetector</artifactId>
<version>1.0.5</version>
</dependency>

spindle
首页:http://www.bitmechanic.com/projects/spindle/ (但是已经无法访问)

2、修改spindle代码得到的spider
目前只是简单地将URL打印出来,解析得到的内容等尚未做进一步处理

解析HTML的基类HtmlParserUtil.java


package com.sillycat.api.commons.utils.html;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;

import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;

public class HtmlParserUtil {

/* StringBuffer的缓冲区大小 */
public static int TRANSFER_SIZE = 4096;

/* 当前平台的行分隔符 */
public static String lineSep = System.getProperty("line.separator");

/* 自动探测页面编码,避免中文乱码的出现 */
public static String autoDetectCharset(URL url) {

   CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
   /**
   * ParsingDetector可用于检查HTML、XML等文件或字符流的编码 构造方法中的参数用于指示是否显示探测过程的详细信息
   * 为false则不显示
   */
   detector.add(new ParsingDetector(false));
   detector.add(JChardetFacade.getInstance());
   detector.add(ASCIIDetector.getInstance());
   detector.add(UnicodeDetector.getInstance());

   Charset charset = null;
   try {
    charset = detector.detectCodepage(url);
   } catch (MalformedURLException mue) {
    mue.printStackTrace();
   } catch (IOException ie) {
    ie.printStackTrace();
   }
   if (charset == null)
    charset = Charset.defaultCharset();
   return charset.name();
}

/* 按照指定编码解析标准的html页面,为建立索引做准备 */
public static String[] parseHtml(String url, String charset) {

   String result[] = null;
   String content = null;

   try {
    URL source = new URL(url);
    InputStream in = source.openStream();
    BufferedReader reader = new BufferedReader(new InputStreamReader(
      in, charset));
    String line = new String();
    StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
    while ((line = reader.readLine()) != null) {
     temp.append(line);
     temp.append(lineSep);
    }
    reader.close();
    in.close();
    content = temp.toString();
   } catch (UnsupportedEncodingException uee) {
    uee.printStackTrace();
   } catch (MalformedURLException mue) {
    System.err.println("Invalid URL : " + url);
   } catch (UnknownHostException uhe) {
    System.err.println("UnknowHost : " + url);
   } catch (SocketException se) {
    System.err.println("Socket Error : " + se.getMessage() + " " + url);
   } catch (SocketTimeoutException ste) {
    System.err.println("Socket Connection Time Out : " + url);
   } catch (FileNotFoundException fnfe) {
    System.err.println("broken link "
      + ((FileNotFoundException) fnfe.getCause()).getMessage()
      + " ignored");
   } catch (IOException ie) {
    ie.printStackTrace();
   }

   if (content != null) {
    Parser myParser = Parser.createParser(content, charset);
    HtmlPage visitor = new HtmlPage(myParser);
    try {
     myParser.visitAllNodesWith(visitor);
     String body = null;
     String title = "Untitled";
     if (visitor.getBody() != null) {
      NodeList nodelist = visitor.getBody();
      body = nodelist.asString().trim();
     }
     if (visitor.getTitle() != null){
      title = visitor.getTitle();
     }
     result = new String[] { body, title };
    } catch (ParserException pe) {
     pe.printStackTrace();
    }
   }
   return result;
}
}


多线程爬虫类    HtmlCaptureRunner.java

package com.sillycat.api.thread.runner;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.sillycat.api.commons.utils.StringUtil;
import com.sillycat.api.commons.utils.html.HtmlParserUtil;

public class HtmlCaptureRunner implements Runnable {

public Log logger = LogFactory.getLog(getClass());

/* 基准(初始)URL */
protected String baseURL = null;

private String contentPath = null;

/**
* 待解析的URL地址集合,所有新检测到的链接均存放于此; 解析时按照先入先出(First-In First-Out)法则线性取出
*/
protected ArrayList URLs = new ArrayList();

/* 已存储的URL地址集合,避免链接的重复抓取 */
protected HashSet indexedURLs = new HashSet();

protected Parser parser = new Parser();;

/* 程序运行线程数,默认2个线程 */
protected int threads = 2;

/* 解析页面时的字符编码 */
protected String charset;

/* 基准端口 */
protected int basePort;

/* 基准主机 */
protected String baseHost;

/* 是否存储,默认true */
protected boolean justDatabase = true;

/* 检测索引中是否存在当前URL信息,避免重复抓取 */
protected boolean isRepeatedCheck = false;

public HtmlCaptureRunner() {
   PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
   factory.registerTag(new LocalLinkTag());
   factory.registerTag(new LocalFrameTag());
   factory.registerTag(new LocalBaseHrefTag());
   parser.setNodeFactory(factory);
}

public void capture() {
   URLs.clear();
   URLs.add(getBaseURL());

   int responseCode = 0;
   String contentType = "";

   try {
    HttpURLConnection uc = (HttpURLConnection) new URL(baseURL)
      .openConnection();
    responseCode = uc.getResponseCode();
    contentType = uc.getContentType();
   } catch (MalformedURLException mue) {
    logger.error("Invalid URL : " + getBaseURL());
   } catch (UnknownHostException uhe) {
    logger.error("UnknowHost : " + getBaseURL());
   } catch (SocketException se) {
    logger.error("Socket Error : " + se.getMessage() + " "
      + getBaseURL());
   } catch (IOException ie) {
    logger.error("IOException : " + ie);
   }

   if (responseCode == HttpURLConnection.HTTP_OK
     && contentType.startsWith("text/html")) {
    try {
     charset = HtmlParserUtil.autoDetectCharset(new URL(baseURL));

     basePort = new URL(baseURL).getPort();
     baseHost = new URL(baseURL).getHost();
     if (charset.equals("windows-1252"))
      charset = "GBK";

     long start = System.currentTimeMillis();
     ArrayList threadList = new ArrayList();
     for (int i = 0; i < threads; i++) {
      Thread t = new Thread(this, "Spider Thread #" + (i + 1));
      t.start();
      threadList.add(t);
     }
     while (threadList.size() > 0) {
      Thread child = (Thread) threadList.remove(0);
      try {
       child.join();
      } catch (InterruptedException ie) {
       logger.error("InterruptedException : " + ie);
      }
     }
     // for (int i = 0; i < threads; i++) {
     // threadPool.getThreadPoolExcutor().execute(new
     // Thread(this,"Spider Thread #" + (i + 1)));
     // }
     long elapsed = System.currentTimeMillis() - start;
     logger.info("Finished in " + (elapsed / 1000) + " seconds");
     logger.info("The Count of the Links Captured is "
       + indexedURLs.size());
    } catch (MalformedURLException e) {
     e.printStackTrace();
    }
   }
}

public void run() {
   String url;
   while ((url = dequeueURL()) != null) {
    if (justDatabase) {
     process(url);
    }
   }
   threads--;
}

/**
* 处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行
*/
protected void process(String url) {

   String result[];
   String content = null;
   String title = null;

   result = HtmlParserUtil.parseHtml(url, charset);
   content = result[0];
   title = result[1];

   if (content != null && content.trim().length() > 0) {
    // content
    System.out.println(url);
    // title
    // DateTools.timeToString(System.currentTimeMillis()
   }
}

/* 从URL队列mPages里取出单个的URL */
public synchronized String dequeueURL() {
   while (true)
    if (URLs.size() > 0) {
     String url = (String) URLs.remove(0);
     indexedURLs.add(url);
     if (isToBeCaptured(url)) {
      NodeList list;
      try {
       int bookmark = URLs.size();
       /* 获取页面所有节点 */
       parser.setURL(url);
       try {
        list = new NodeList();
        for (NodeIterator e = parser.elements(); e
          .hasMoreNodes();)
         list.add(e.nextNode());
       } catch (EncodingChangeException ece) {
        /* 解码出错的异常处理 */
        parser.reset();
        list = new NodeList();
        for (NodeIterator e = parser.elements(); e
          .hasMoreNodes();)
         list.add(e.nextNode());
       }
       /**
       * 依据 http://www.robotstxt.org/wc/meta-user.html 处理
       * Robots <META> tag
       */
       NodeList robots = list
         .extractAllNodesThatMatch(
           new AndFilter(new NodeClassFilter(
             MetaTag.class),
             new HasAttributeFilter("name",
               "robots")), true);
       if (0 != robots.size()) {
        MetaTag robot = (MetaTag) robots.elementAt(0);
        String content = robot.getAttribute("content")
          .toLowerCase();
        if ((-1 != content.indexOf("none"))
          || (-1 != content.indexOf("nofollow")))
         for (int i = bookmark; i < URLs.size(); i++)
          URLs.remove(i);
       }
      } catch (ParserException pe) {
       logger.error("ParserException : " + pe);
      }
      return url;
     }
    } else {
     threads--;
     if (threads > 0) {
      try {
       wait();
       threads++;
      } catch (InterruptedException ie) {
       logger.error("InterruptedException : " + ie);
      }
     } else {
      notifyAll();
      return null;
     }
    }
}

private boolean isHTML(String url) {
   if (!url.endsWith(".html")) {
    return false;
   }
   if (StringUtil.isNotBlank(contentPath)) {
    if (!url.startsWith(baseURL + "/" + contentPath)) {
     return false;
    }
   }
   return true;
}

/**
* 判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain
*/
public boolean isToBeCaptured(String url) {

   boolean flag = false;

   HttpURLConnection uc = null;
   int responseCode = 0;
   String contentType = "";
   String host = "";
   int port = 0;

   try {
    URL source = new URL(url);
    String protocol = source.getProtocol();
    if (protocol != null && protocol.equals("http")) {
     host = source.getHost();
     port = source.getPort();
     uc = (HttpURLConnection) source.openConnection();
     uc.setConnectTimeout(8000);
     responseCode = uc.getResponseCode();
     contentType = uc.getContentType();
    }
   } catch (MalformedURLException mue) {
    logger.error("Invalid URL : " + url);
   } catch (UnknownHostException uhe) {
    logger.error("UnknowHost : " + url);
   } catch (SocketException se) {
    logger.error("Socket Error : " + se.getMessage() + " " + url);
   } catch (SocketTimeoutException ste) {
    logger.error("Socket Connection Time Out : " + url);
   } catch (FileNotFoundException fnfe) {
    logger.error("broken link " + url + " ignored");
   } catch (IOException ie) {
    logger.error("IOException : " + ie);
   }
   if (port == basePort
     && responseCode == HttpURLConnection.HTTP_OK
     && host.equals(baseHost)
     && (contentType.startsWith("text/html") || contentType
       .startsWith("text/plain")))
    flag = true;
   return flag;
}

class LocalLinkTag extends LinkTag {
   public void doSemanticAction() {
    String link = getLink();
    if (link.endsWith("/"))
     link = link.substring(0, link.length() - 1);
    int pos = link.indexOf("#");
    if (pos != -1)
     link = link.substring(0, pos);
    /* 将链接加入到处理队列中 */
    if (!(indexedURLs.contains(link) || URLs.contains(link))) {
     if (isHTML(link)) {
      URLs.add(link);
     }
    }
    setLink(link);
   }
}

/**
* Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local
* targets if they match the source.
*/
class LocalFrameTag extends FrameTag {
   public void doSemanticAction() {
    String link = getFrameLocation();
    if (link.endsWith("/"))
     link = link.substring(0, link.length() - 1);
    int pos = link.indexOf("#");
    if (pos != -1)
     link = link.substring(0, pos);
    /* 将链接加入到处理队列中 */
    if (!(indexedURLs.contains(link) || URLs.contains(link))) {
     if (isHTML(link)) {
      URLs.add(link);
     }
    }
    setFrameLocation(link);
   }
}

/**
* Base tag that doesn't show. The toHtml() method is overridden to return
* an empty string, effectively shutting off the base reference.
*/
class LocalBaseHrefTag extends BaseHrefTag {
   public String toHtml() {
    return ("");
   }
}

public String getBaseURL() {
   return baseURL;
}

public void setBaseURL(String baseURL) {
   this.baseURL = baseURL;
}

public int getThreads() {
   return threads;
}

public void setThreads(int threads) {
   this.threads = threads;
}

public String getCharset() {
   return charset;
}

public void setCharset(String charset) {
   this.charset = charset;
}

public int getBasePort() {
   return basePort;
}

public void setBasePort(int basePort) {
   this.basePort = basePort;
}

public String getBaseHost() {
   return baseHost;
}

public void setBaseHost(String baseHost) {
   this.baseHost = baseHost;
}

public boolean isJustDatabase() {
   return justDatabase;
}

public void setJustDatabase(boolean justDatabase) {
   this.justDatabase = justDatabase;
}

public String getContentPath() {
   return contentPath;
}

public void setContentPath(String contentPath) {
   this.contentPath = contentPath;
}

}

spring上的配置文件applicationContext-bean.xml:
<bean id="productCapture"
   class="com.sillycat.api.thread.runner.HtmlCaptureRunner" >
   <property name="contentPath" value="${product.contentPath}" />
   <property name="basePort" value="${product.base.port}" />
   <property name="baseURL" value="${product.base.url}" />
   <property name="charset" value="${product.base.code}" />
   <property name="threads" value="${product.base.threads}"/>
</bean>

<bean id="messageCapture"
   class="com.sillycat.api.thread.runner.HtmlCaptureRunner" >
   <property name="contentPath" value="${message.contentPath}" />
   <property name="basePort" value="${message.base.port}" />
   <property name="baseURL" value="${message.base.url}" />
   <property name="charset" value="${message.base.code}" />
   <property name="threads" value="${message.base.threads}"/>
</bean>

easySearch.properties配置文件:
#==========================================
# spider configuration
#=========================================
product.contentPath=product
product.base.port=80
product.base.url=http://www.safedv.com
product.base.code=UTF-8
product.base.threads=3

message.contentPath=message
message.base.port=80
message.base.url=http://www.safedv.com
message.base.code=UTF-8
message.base.threads=3

单元测试类HtmlRunnerTest.java文件:

package com.sillycat.api.thread;

import com.sillycat.api.commons.base.BaseManagerTest;
import com.sillycat.api.thread.runner.HtmlCaptureRunner;

public class HtmlRunnerTest extends BaseManagerTest {

private HtmlCaptureRunner productCapture;

private HtmlCaptureRunner messageCapture;

protected void setUp() throws Exception {
   super.setUp();
   productCapture = (HtmlCaptureRunner) appContext.getBean("productCapture");
   messageCapture = (HtmlCaptureRunner) appContext.getBean("messageCapture");
}

protected void tearDown() throws Exception {
   super.tearDown();
}

public void testDumy() {
   assertTrue(true);
}

public void ntestProductCapture() {
   productCapture.capture();
}

public void testMessageCapture(){
   messageCapture.capture();
}
}


本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439066.aspx

一个简单的java网络爬虫(spider)

http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439016.aspx

一个简单的java网络爬虫,由于时间原因,没有进一步解释.

所需的htmlparser.jar包请到htmlparser官方网站下载。

---------------------------------------------Spider.java-----------------------------------------------------------------


import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.htmlparser.RemarkNode;
import org.htmlparser.StringNode;
import org.htmlparser.Node;
import org.htmlparser.tags.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.util.Queue;
import java.util.LinkedList;

public class Spider implements Runnable {

boolean search_key_words = false;

int count = 0;

int limitsite = 10;

int countsite = 1;

String keyword = "中国";//搜索关键字

Parser parser = new Parser();

// List linklist = new ArrayList();
String startsite = "";//搜索的其实站点

SearchResultBean srb;//保存搜索结果

List resultlist = new ArrayList();//搜索到关键字链接列表

List searchedsite = new ArrayList();//已经被搜索站点列表

Queue linklist = new LinkedList();//需解析的链接列表

HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();

public Spider(String keyword, String startsite) {
   this.keyword = keyword;
   this.startsite = startsite;
   linklist.add(startsite);
   srb = new SearchResultBean();
}

public void run() {
   // TODO Auto-generated method stub
   search(linklist);

}

public void search(Queue queue) {
   String url = "";
     while(!queue.isEmpty()){
    url = queue.peek().toString();//查找列队
    try {
     if (!isSearched(searchedsite, url)) {
      if (isRobotAllowed(new URL(url)))//检查该链接是否被允许搜索
       processHtml(url);
      else
       System.out.println("this page is disallowed to search");
     }
    } catch (Exception ex) {

    }
    queue.remove();
  
     }
    
}
/**
* 解析HTML
* @param url 
* @throws ParserException
* @throws Exception
*/
public void processHtml(String url) throws ParserException, Exception {
   searchedsite.add(url);
   count = 0;
   System.out.println("searching ... :" + url);
   parser.setURL(url);
   parser.setEncoding("GBK");
   URLConnection uc = parser.getConnection();
   uc.connect();
   //uc.getLastModified();
   NodeIterator nit = parser.elements();
  
   while (nit.hasMoreNodes()) {
    Node node = nit.nextNode();
    parserNode(node);
   }
   srb.setKeywords(keyword);
   srb.setUrl(url);
   srb.setCount_key_words(count);
   resultlist.add(srb);
   System.out.println("count keywords is :" + count);
   System.out.println("----------------------------------------------");
}
/**
* 处理HTML标签
* @param tag
* @throws Exception
*/
public void dealTag(Tag tag) throws Exception {
   NodeList list = tag.getChildren();
   if (list != null) {
    NodeIterator it = list.elements();
    while (it.hasMoreNodes()) {
     Node node = it.nextNode();
     parserNode(node);
    }
   }
}
/**
* 处理HTML标签结点
* @param node
* @throws Exception
*/
    public void parserNode(Node node) throws Exception{
    if (node instanceof StringNode) {//判断是否是文本结点
    StringNode sNode = (StringNode) node;
    StringFilter sf = new StringFilter(keyword,false);
    search_key_words = sf.accept(sNode);
    if (search_key_words) {
     count++;
    }
    // System.out.println("text is :"+sNode.getText().trim());
   } else if (node instanceof Tag) {//判断是否是标签库结点
    Tag atag = (Tag) node;
    if (atag instanceof TitleTag) {//判断是否是标TITLE结点
     srb.setTitle(atag.getText());
    }
    if (atag instanceof LinkTag) {//判断是否是标LINK结点
     LinkTag linkatag = (LinkTag) atag;
     checkLink(linkatag.getLink(), linklist);
     // System.out.println("-----------------this is link --------------");
    }
    dealTag(atag);
   } else if (node instanceof RemarkNode) {//判断是否是注释
    // System.out.println("this is remark");
   }
    }
    /*
     * 检查链接是否需要加入列队
     */
public void checkLink(String link, Queue queue) {
   if (link != null && !link.equals("") && link.indexOf("#") == -1) {
    if (!link.startsWith("http://") && !link.startsWith("ftp://")
      && !link.startsWith("www.")) {
     link = "file:///" + link;
    } else if (link.startsWith("www.")) {
     link = "http://" + link;
    }
    if (queue.isEmpty())
     queue.add(link);
    else {
     String link_end_=link.endsWith("/")?link.substring(0,link.lastIndexOf("/")):(link+"/");
     if (!queue.contains(link)&&!queue .contains(link_end_)) {
      queue.add(link);
     }
    }
   }
}
/**
* 检查该链接是否已经被扫描
* @param list
* @param url
* @return
*/
public boolean isSearched(List list, String url) {
   String url_end_ = "";
   if (url.endsWith("/")) {
    url_end_ = url.substring(0, url.lastIndexOf("/"));
   } else {
    url_end_ = url + "/";
   }
   if (list.size() > 0) {
    if (list.indexOf(url) != -1 || list.indexOf(url_end_) != -1) {
     return true;
    }
   }
   return false;
}
/**
* 检查URL是否被允许搜索
* @param urlToCheck
* @return
*/
private boolean isRobotAllowed(URL urlToCheck) {
   String host = urlToCheck.getHost().toLowerCase();// 获取给出RUL的主机
   // System.out.println("主机="+host);

   // 获取主机不允许搜索的URL缓存
   ArrayList<String> disallowList = disallowListCache.get(host);

   // 如果还没有缓存,下载并缓存。
   if (disallowList == null) {
    disallowList = new ArrayList<String>();
    try {
     URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
     BufferedReader reader = new BufferedReader(
       new InputStreamReader(robotsFileUrl.openStream()));

     // 读robot文件,创建不允许访问的路径列表。
     String line;
     while ((line = reader.readLine()) != null) {
      if (line.indexOf("Disallow:") == 0) {// 是否包含"Disallow:"
       String disallowPath = line.substring("Disallow:"
         .length());// 获取不允许访问路径

       // 检查是否有注释。
       int commentIndex = disallowPath.indexOf("#");
       if (commentIndex != -1) {
        disallowPath = disallowPath.substring(0,
          commentIndex);// 去掉注释
       }

       disallowPath = disallowPath.trim();
       disallowList.add(disallowPath);
      }
     }
     for (Iterator it = disallowList.iterator(); it.hasNext();) {
      System.out.println("Disallow is :" + it.next());
     }
     // 缓存此主机不允许访问的路径。
     disallowListCache.put(host, disallowList);
    } catch (Exception e) {
     return true; // web站点根目录下没有robots.txt文件,返回真
    }
   }

   String file = urlToCheck.getFile();
   // System.out.println("文件getFile()="+file);
   for (int i = 0; i < disallowList.size(); i++) {
    String disallow = disallowList.get(i);
    if (file.startsWith(disallow)) {
     return false;
    }
   }

   return true;
}

public static void main(String[] args) {

   Spider ph = new Spider("英超", "http://www.microsoft.com");
   try {
    // ph.processHtml();
    Thread search = new Thread(ph);
    search.start();//启动线程
   } catch (Exception ex) {

   }

}
}

--------------------------------------SearchResultBean.java---------------------------------------------------------


/**
 * Plain data holder for the outcome of searching one page: the page URL, its
 * title, the keywords that were searched for and the number of matches.
 */
public class SearchResultBean {

    /* Address of the searched page; empty by default. */
    String url = "";

    /* Title of the searched page; empty by default. */
    String title = "";

    /* Keywords that were searched for; empty by default. */
    String keywords = "";

    /* Number of keyword matches; 0 by default. */
    int count_key_words = 0;

    /** @return the address of the searched page */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address of the searched page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title of the searched page */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title of the searched page */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the keywords that were searched for */
    public String getKeywords() {
        return this.keywords;
    }

    /** @param keywords the keywords that were searched for */
    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    /** @return the number of keyword matches */
    public int getCount_key_words() {
        return this.count_key_words;
    }

    /** @param count_key_words the number of keyword matches */
    public void setCount_key_words(int count_key_words) {
        this.count_key_words = count_key_words;
    }
}

(wyymaomi)
本站文章除注明转载外,均为本站原创或编译,欢迎任何形式的转载,但请务必注明出处,尊重他人劳动,同学习共成长。转载请注明:文章转载自:罗索实验室 [http://www1.rosoo.net/a/201006/9576.html]
本文出处:CSDN博客 作者:wyymaomi
顶一下
(3)
100%
踩一下
(0)
0%
------分隔线----------------------------
发表评论
请自觉遵守互联网相关的政策法规,严禁发布色情、暴力、反动的言论。
评价:
表情:
用户名: 验证码:点击我更换图片
栏目列表
将本文分享到微信
织梦二维码生成器
推荐内容