上篇看到Injector的初始化,这次继续阅读,看看Mapper
/**
 * Turns one line of seed text into an injected {@link CrawlDatum} and emits it.
 *
 * <p>A line has the form {@code url[\tname=value]...}. Lines whose trimmed text
 * starts with {@code #} are skipped. Three reserved metadata names (held in
 * {@code nutchScoreMDName}, {@code nutchFetchIntervalMDName} and
 * {@code nutchFixedFetchIntervalMDName}) override the score and the fetch
 * interval; every other {@code name=value} pair is copied into the datum's
 * metadata. A reserved value that is not a valid number is logged and ignored,
 * leaving the default in place.
 *
 * <p>The URL is then normalized and filtered. If it is rejected, or either step
 * throws, the {@code injector/urls_filtered} counter is incremented and nothing
 * is emitted. Otherwise the {@code injector/urls_injected} counter is
 * incremented and the pair (url, datum) is collected.
 *
 * @param key      input key; not read by this method
 * @param value    one line of seed text; overwritten with the normalized URL
 *                 before it is emitted
 * @param output   collector that receives the (url, datum) pair
 * @param reporter used to increment the {@code injector} counters
 * @throws IOException if emitting the record fails (presumably raised by
 *                     {@code output.collect} - confirm against the collector)
 */
public void map(WritableComparable key, Text value,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {
  String url = value.toString(); // value is line of text

  // Ignore lines that start with #.
  if (url != null && url.trim().startsWith("#")) {
    return;
  }

  // Anything after the first tab is metadata: name=value pairs separated by \t.
  float customScore = -1f;
  int customInterval = interval;
  int fixedInterval = -1;
  Map<String, String> metadata = new TreeMap<String, String>();

  if (url.indexOf("\t") != -1) {
    // Limit -1 keeps trailing empty tokens, so a line made only of tabs yields
    // splits[0] == "" instead of an empty array (which made splits[0] throw).
    // The extra empty tokens contain no '=' and are skipped below.
    String[] splits = url.split("\t", -1);
    url = splits[0];
    for (int s = 1; s < splits.length; s++) {
      // Find the separation between name and value.
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // Skip anything without a '='.
        continue;
      }

      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);

      try {
        // Reserved names tune the datum itself; everything else is stored
        // as plain metadata.
        if (metaname.equals(nutchScoreMDName)) {
          customScore = Float.parseFloat(metavalue);
        } else if (metaname.equals(nutchFetchIntervalMDName)) {
          customInterval = Integer.parseInt(metavalue);
        } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
          fixedInterval = Integer.parseInt(metavalue);
        } else {
          metadata.put(metaname, metavalue);
        }
      } catch (NumberFormatException nfe) {
        // Best effort: keep the default, but make the bad seed entry visible.
        if (LOG.isWarnEnabled()) {
          LOG.warn("Ignoring invalid value '" + metavalue + "' for metadata '"
              + metaname + "' of url " + url);
        }
      }
    }
  }

  try {
    // Normalize, then filter; the filter's result replaces the URL, and a
    // null result is treated as "rejected" below.
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " +url+":"+e);
    }
    url = null;
  }

  if (url == null) {
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else { // if it passes
    value.set(url); // collect it

    // The CrawlDatum carries the status, fetch interval, fetch time,
    // metadata and score for this URL.
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_INJECTED);

    // Is the interval fixed? Then also record it as metadata.
    if (fixedInterval > -1) {
      // Set writable using float. Float is used by AdaptiveFetchSchedule.
      datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY,
          new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }

    datum.setFetchTime(curTime);

    // Now add the free-form metadata.
    for (Map.Entry<String, String> entry : metadata.entrySet()) {
      datum.getMetaData().put(new Text(entry.getKey()),
          new Text(entry.getValue()));
    }

    // -1 is the "not supplied" sentinel, so a seed score of exactly -1 is
    // treated as absent and replaced by the default injected score.
    if (customScore != -1) {
      datum.setScore(customScore);
    } else {
      datum.setScore(scoreInjected);
    }

    try {
      // Let the scoring filters initialize the score.
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Cannot filter injected score for url " + url
            + ", using default (" + e.getMessage() + ")");
      }
    }

    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}
} // closes the enclosing class, whose header lies outside this excerpt