Nutch index source code analysis (1)

An introduction to the indexing method Nutch uses when integrated with Solr.

/**

   * build index

   * @param solrUrl URL of the Solr server

   * @param crawlDb crawl DB path, e.g. \crawl\crawldb

   * @param linkDb link DB path, e.g. \crawl\linkdb

   * @param segments segment data path, e.g. \crawl\segments

   * @param noCommit whether to skip committing the index to the Solr server

   * @param deleteGone whether to delete gone documents (404s and permanent redirects)

   * @param solrParams additional Solr parameters

   * @param filter whether to apply URL filtering

   * @param normalize whether to normalize URLs

   * @throws IOException

   */

  public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,

      List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams,

      boolean filter, boolean normalize) throws IOException {

        ...

       IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

        ...

   }

Nutch's indexing is implemented as a MapReduce job.
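IndexerMapReduce.initMRJob, called in the snippet above, is what wires this job together: it adds the crawl DB, the link DB and the relevant segment subdirectories as sequence-file inputs, and registers IndexerMapReduce as both mapper and reducer. The following is only a rough sketch of that setup, assuming the standard Nutch 1.x directory constants and the old mapred API; it is not a verbatim copy of the source.

  public static void initMRJob(Path crawlDb, Path linkDb,
      Collection<Path> segments, JobConf job) {
    // Every segment contributes fetch status, parse status, parse data and parse text.
    for (Path segment : segments) {
      FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
      FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
      FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
      FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }
    // The "current" databases of the crawl DB and link DB are inputs as well.
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);
    // The reducer emits url -> NutchIndexAction; IndexerOutputFormat forwards it to Solr.
    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
  }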

The input to the map stage is the sequence files in the directories Nutch produced while crawling: the key is the crawled URL, and the value is a NutchWritable, a generic wrapper that can hold any of Nutch's custom data types.

The data types NutchWritable can wrap are as follows:

CLASSES = new Class[] {

      org.apache.hadoop.io.NullWritable.class,

      org.apache.hadoop.io.BooleanWritable.class,

      org.apache.hadoop.io.LongWritable.class,

      org.apache.hadoop.io.BytesWritable.class,

      org.apache.hadoop.io.FloatWritable.class,

      org.apache.hadoop.io.IntWritable.class,

      org.apache.hadoop.io.MapWritable.class,

      org.apache.hadoop.io.Text.class,

      org.apache.hadoop.io.MD5Hash.class,

      org.apache.nutch.crawl.CrawlDatum.class,

      org.apache.nutch.crawl.Inlink.class,

      org.apache.nutch.crawl.Inlinks.class,

      org.apache.nutch.fetcher.FetcherOutput.class,

      org.apache.nutch.metadata.Metadata.class,

      org.apache.nutch.parse.Outlink.class,

      org.apache.nutch.parse.ParseText.class,

      org.apache.nutch.parse.ParseData.class,

      org.apache.nutch.parse.ParseImpl.class,

      org.apache.nutch.parse.ParseStatus.class,

      org.apache.nutch.protocol.Content.class,

      org.apache.nutch.protocol.ProtocolStatus.class,

      org.apache.nutch.scoring.webgraph.LinkDatum.class,

    };

These classes cover the data Nutch produces at each stage of a crawl.
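NutchWritable is built on Hadoop's GenericWritable pattern: on the wire it stores a small type index followed by the wrapped object, which is how a single value class can carry any of the types listed above. The class below is a minimal, self-contained illustration of that pattern (the class name TypedWrapper and the shortened type table are assumptions for the sketch; the real NutchWritable registers the full CLASSES array):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class TypedWrapper implements Writable {

  // NutchWritable registers the full CLASSES array here; two entries suffice for the sketch.
  @SuppressWarnings("unchecked")
  private static final Class<? extends Writable>[] TYPES = new Class[] {
      org.apache.hadoop.io.Text.class,
      org.apache.hadoop.io.LongWritable.class
  };

  private Writable instance;

  public void set(Writable instance) { this.instance = instance; }

  public Writable get() { return instance; }

  public void write(DataOutput out) throws IOException {
    // Record which concrete type follows, then let the instance serialize itself.
    out.writeByte(indexOf(instance.getClass()));
    instance.write(out);
  }

  public void readFields(DataInput in) throws IOException {
    // Read the type index, instantiate that class, then let it read its own fields.
    byte typeIndex = in.readByte();
    instance = ReflectionUtils.newInstance(TYPES[typeIndex], null);
    instance.readFields(in);
  }

  private byte indexOf(Class<?> clazz) {
    for (byte i = 0; i < TYPES.length; i++) {
      if (TYPES[i] == clazz) return i;
    }
    throw new IllegalArgumentException("Unregistered type: " + clazz);
  }
}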

The map stage does not process the value; it works only on the URL key. The processing code is as follows:

   String urlString = filterUrl(normalizeUrl(key.toString()));

This call filters and normalizes the URL according to the configured rules; whether each step is performed is controlled by the filter and normalize flags passed on the command line.
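Put together, the whole map method is short: a URL that survives normalization and filtering is re-emitted as the key, with the original value wrapped in a NutchWritable; a URL rejected by the filters causes the record to be dropped. The sketch below captures that logic and is close to, but not necessarily identical with, the actual IndexerMapReduce.map:

  public void map(Text key, Writable value,
      OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
    // Normalize and filter the URL; either step may be disabled via the job configuration.
    String urlString = filterUrl(normalizeUrl(key.toString()));
    if (urlString == null) {
      // The URL was rejected by a filter: drop this record entirely.
      return;
    }
    key.set(urlString);
    // Wrap the value so the reducer sees one uniform value type.
    output.collect(key, new NutchWritable(value));
  }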

The reduce stage processes all of the crawled data; the annotated code is as follows:

 /**

   * Output format: url as key, indexed action as value

   */

  public void reduce(Text key, Iterator<NutchWritable> values,

                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)

    throws IOException {

    Inlinks inlinks = null;

    CrawlDatum dbDatum = null;

    CrawlDatum fetchDatum = null;

    ParseData parseData = null;

    ParseText parseText = null;

 

    while (values.hasNext()) {

      final Writable value = values.next().get(); // unwrap

      // Inlinks: the incoming links collected for this URL

      if (value instanceof Inlinks) {

        inlinks = (Inlinks)value;

        // CrawlDatum: the crawl status record for this URL

      } else if (value instanceof CrawlDatum) {

        final CrawlDatum datum = (CrawlDatum)value;

        // The datum carries a CrawlDb status (the URL's state in the crawl DB)

        if (CrawlDatum.hasDbStatus(datum)) {

          dbDatum = datum;

        }

        // The datum carries a fetch status (the page has been fetched)

        else if (CrawlDatum.hasFetchStatus(datum)) {

 

          // don't index unmodified (empty) pages

          // Only keep the fetch datum if the page actually changed since the last fetch

          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {

            fetchDatum = datum;

 

            /**

             * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.

             */

            // If deletion is enabled, remove gone and permanently redirected pages from the index

            if (delete) {

              // The page is gone (e.g. 404): emit a delete action

              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

                reporter.incrCounter("IndexerStatus", "Documents deleted", 1);

 

                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);

                output.collect(key, action);

                return;

              }

              // The page was permanently redirected elsewhere: emit a delete action

              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {

                reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);

 

                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);

                output.collect(key, action);

                return;

              }

            }

          }

        // Statuses produced while linking, computing signatures or handling parse metadata carry no indexable content, so skip them

        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||

                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||

                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {

          continue;

        } else {

          throw new RuntimeException("Unexpected status: "+datum.getStatus());

        }

      // ParseData: metadata produced by the parser

      } else if (value instanceof ParseData) {

        parseData = (ParseData)value;

 

        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434

        if (deleteRobotsNoIndex) {

          // Get the robots meta data

          String robotsMeta = parseData.getMeta("robots");

 

          // Has it a noindex for this url?

          if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {

            // Delete it!

            NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);

            output.collect(key, action);

            return;

          }

        }

        // ParseText: the extracted plain text of the page

      } else if (value instanceof ParseText) {

        parseText = (ParseText)value;

      } else if (LOG.isWarnEnabled()) {

        LOG.warn("Unrecognized type: "+value.getClass());

      }

    }

    // If any of the fetch datum, db datum, parse text or parse data is missing (e.g. we only saw inlinks), there is nothing to index

    if (fetchDatum == null || dbDatum == null

        || parseText == null || parseData == null) {

      return;                                     // only have inlinks

    }

 

    // Whether to skip DB_NOTMODIFIED pages

    // If the page is unmodified since the last crawl and skipping was requested on the command line, skip it

    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {

      reporter.incrCounter("IndexerStatus", "Skipped", 1);

      return;

    }

    // Return if parsing failed or the fetch was not fully successful

    if (!parseData.getStatus().isSuccess() ||

        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {

      return;

    }

 

    NutchDocument doc = new NutchDocument();

    //Get the metadata of the page from the parsed data

    final Metadata metadata = parseData.getContentMeta();

 

    // add segment, used to map from merged index back to segment files

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // page signature (digest)

    // add digest, used by dedup

    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    

    final Parse parse = new ParseImpl(parseText, parseData);

    try {

      // extract information from dbDatum and pass it to

      // fetchDatum so that indexing filters can use it

      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);

      if (url != null) {

        fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);

      }

      // run indexing filters


      doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

    } catch (final IndexingException e) {

      if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }

      reporter.incrCounter("IndexerStatus", "Errors", 1);

      return;

    }

 

    // skip documents discarded by indexing filters

    if (doc == null) {

      reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);

      return;

    }

 

    float boost = 1.0f;

    // run scoring filters


    try {

      boost = this.scfilters.indexerScore(key, doc, dbDatum,

              fetchDatum, parse, inlinks, boost);

    } catch (final ScoringFilterException e) {

      if (LOG.isWarnEnabled()) {

        LOG.warn("Error calculating score " + key + ": " + e);

      }

      return;

    }

    // Use the score as the weight of the document

    // apply boost to all indexed fields.

    doc.setWeight(boost);

    // store boost for use by explain and dedup

    doc.add("boost", Float.toString(boost));

 

    reporter.incrCounter("IndexerStatus", "Documents added", 1);

 

    NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);

    output.collect(key, action);

  }

That is as far as this analysis goes for now; the rest will be covered in a follow-up post.
