Spark 2.1 job in yarn-cluster mode fails with AnnotatedConnectException: Connection refused

Job submission script:
[root@tony-client-1-001 ~]# vim /mnt/tony/rec_model/model/data/f0.sh

#!/usr/bin/env bash

curr_dir=`pwd`
src_path=${curr_dir}
spark_lib_path='/usr/hdp/2.5.0.0-1245/spark/lib'
hbase_lib_path='/usr/hdp/2.5.0.0-1245/hbase/lib'

echo ${spark_lib_path}
echo ${hbase_lib_path}

spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 4 \
  --executor-memory 4G \
  --executor-cores 4 \
  --driver-memory 4G \
  --queue fintech \
  --name 'yhl_f_0' \
  --jars ${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar,${hbase_lib_path}/hbase-server.jar,${hbase_lib_path}/hbase-protocol.jar,${hbase_lib_path}/hbase-hadoop2-compat.jar,${hbase_lib_path}/hbase-client.jar,${hbase_lib_path}/hbase-common.jar,${hbase_lib_path}/htrace-core-3.1.0-incubating.jar,/usr/hdp/2.6.1.0-129/hadoop/lib/hadoop-ks3-0.1.jar \
  --driver-class-path ${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar:${hbase_lib_path}/hbase-server.jar:${hbase_lib_path}/hbase-protocol.jar:${hbase_lib_path}/hbase-hadoop2-compat.jar:${hbase_lib_path}/hbase-client.jar:${hbase_lib_path}/hbase-common.jar:${hbase_lib_path}/htrace-core-3.1.0-incubating.jar \
  --conf spark.executor.extraClassPath=${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar:${hbase_lib_path}/hbase-server.jar:${hbase_lib_path}/hbase-protocol.jar:${hbase_lib_path}/hbase-hadoop2-compat.jar:${hbase_lib_path}/hbase-client.jar:${hbase_lib_path}/hbase-common.jar:${hbase_lib_path}/htrace-core-3.1.0-incubating.jar \
  ${src_path}/f0.py

The Python code for the Spark job:
[root@tony-client-1-001 ~]# vim /mnt/tony/rec_model/model/data/f0.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import datetime

# os.environ['SPARK_HOME']="E:/sort/jars/spark-2.1.0-bin-hadoop2.7"
# sys.path.append("E:/sort/jars/spark-2.1.0-bin-hadoop2.7/python")

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.rdd import StorageLevel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType ,StructField , StringType , Row

g_conf = {}

def show(x) :
    print "############ " , x

def initConfig(conf):
    for i in conf :
        g_conf[i] = conf[i]

def init(conf):
    initConfig(conf)

def f1(x):
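    # Parse one tab-separated rec_features line, derive the ISO week number from the timestamp,
    # and key the record by (uuid, weeknum) for the join with the user-profile data.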
    srr = x.encode("utf-8").split("\t")
    y = srr[0]
    uuid = srr[1]
    newsid = srr[2]
    recid = srr[3]
    click_tstp = srr[4]
    cateid = srr[5]
    click_region = srr[6]
    uuid_group = srr[7]
    timestamp = srr[8]
    article_id = srr[9]
    lda_title = srr[10]
    lda_content = srr[11]
    doc2vec_title = srr[12]

    year = int(timestamp[0:4])
    month = int(timestamp[4:6])
    day = int(timestamp[6:])
    weeknum = datetime.date(year, month, day).isocalendar()[1] - 1
    weeknum = str(weeknum)

    return ((uuid , weeknum) ,  (y , uuid , newsid , recid , click_tstp , cateid , click_region , uuid_group ,
               timestamp , article_id , lda_title ,lda_content , doc2vec_title , weeknum))

def f2(x):
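    # Parse one tab-separated user-profile line; the (uuid, weeknum) key mirrors f1 so the two RDDs can be joined.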
    srr = x.encode("utf-8").split("\t")
    uuid = srr[0]
    preference = srr[1]
    frequence = srr[2]
    weeknum = srr[3]

    return ((uuid , weeknum) ,(uuid , preference , frequence , weeknum))

def f3(x) :
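    # Flatten one joined record ((uuid, weeknum), (rec_row, profile_row)) back into a single tab-separated line.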
    r0 = x[1][0]
    r1 = x[1][1]

    y = r0[0]
    uuid = r0[1]
    newsid = r0[2]
    recid = r0[3]
    click_tstp = r0[4]
    cateid = r0[5]
    click_region = r0[6]
    uuid_group = r0[7]
    timestamp = r0[8]
    article_id = r0[9]
    lda_title = r0[10]
    lda_content = r0[11]
    doc2vec_title = r0[12]
    preference = r1[1]
    frequence = r1[2]

    result = [y , uuid , newsid , recid , click_tstp , cateid , click_region , uuid_group , timestamp , article_id ,
              lda_title , lda_content , doc2vec_title , preference , frequence]

    return "\t".join(result)


def features(sc) :
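    # Read both feature sources, cache them, join on (uuid, weeknum), and write the result to KS3.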
    rec_features = g_conf["rec_features"]
    user_profile_features = g_conf["user_profile_features"]
    ks3_tmp_features = g_conf["ks3_tmp_features"]

    rdd = sc.textFile(rec_features).map(f1)
    rdd1 = sc.textFile(user_profile_features).map(f2)
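    # Persist both sides of the join with MEMORY_AND_DISK_SER so serialized blocks spill to disk
    # instead of being dropped when executor memory runs short.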
    rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)
    rdd1.persist(StorageLevel.MEMORY_AND_DISK_SER)
    rdd2 = rdd.join(rdd1).map(f3)
    rdd2.saveAsTextFile(ks3_tmp_features)

def execute(conf):
    init(conf)
    sparkconf = SparkConf()
    sc = SparkContext(conf=sparkconf)
    features(sc)
    sc.stop()


if __name__ == "__main__":
    conf = {
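        # "cols" and "format" are carried in the config but not used by this script;
        # the three KS3 paths are the job's inputs and output.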
        "cols":"y,uuid,newsid,recid,click_tstp,cateid,click_region,uuid_group,timestamp,article_id,lda_title,lda_content,"
               "doc2vec_title,lda_update_time,doc2vec_update_time,doc2vec_title_20171221,doc2vec20171221_update_time,mix_ldadoc2vec_update_time,"
               "mix_ldadoc2vec_title,mix_ldadoc2vec_update_time_test20180516,mix_ldadoc2vec_title_test20180516",
        "format": "%Y-%m-%d",
        "rec_features": "ks3://tony-fintech/recsys/project=rec_features/main_rec_features/",
        "user_profile_features": "ks3://tony-fintech/recsys/project=rec_features/user_profile/year=2018",
        "ks3_tmp_features":"ks3://tony-fintech/user_profile/project=yanghaolan/rec_sys/lr/tmp/feature0/",
    }
    execute(conf)

Excerpt from the YARN logs for the job's applicationId:
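
(Aggregated YARN application logs like the excerpt below can be dumped with something along these lines, where <applicationId> is a placeholder for the actual ID:)

yarn logs -applicationId <applicationId>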

18/06/11 20:22:56 ERROR ShuffleBlockFetcherIterator: Failed to get block(s) from kmr-a20125dd-gn-a05044c6-core-1-003.ksc.com:6723
java.lang.NullPointerException: group
        at io.netty.bootstrap.AbstractBootstrap.group(AbstractBootstrap.java:80)
        at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:205)
        at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:182)
        at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:97)
        at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:141)
        at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
        at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:171)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
        at java.lang.Thread.run(Thread.java:745)
... ...
java.io.IOException: Failed to connect to /172.31.0.27:21469
        at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:232)
        at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:182)
        at org.apache.spark.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:197)
        at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
        at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
... ...
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /172.31.0.27:21469
        at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
        at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
        at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:257)
        at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:291)
        at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:640)
        at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575)
        at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451)
        at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140)
        at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)

My suspicion was that the job ran out of memory while persisting data in memory, so some of the cached partitions were evicted; when the next stage tried to fetch those blocks they no longer existed, hence the error.
As a first attempt I switched the persistence level to memory-and-disk, i.e. called rdd.persist(StorageLevel.MEMORY_AND_DISK_SER) in the Spark application.

I reran the job and it failed again. Checking the logs once more:

18/06/12 14:35:15 ERROR CoarseGrainedExecutorBackend: RECEIVED SIGNAL TERM
18/06/12 14:35:15 INFO DiskBlockManager: Shutdown hook called
18/06/12 14:35:15 ERROR Executor: Exception in task 135.1 in stage 0.0 (TID 266)
java.io.FileNotFoundException: /mnt/yarn/local/usercache/root/appcache/application_1525783734500_1219/blockmgr-406ab4d5-0c66-4cba-bb03-11430ec9e2f1/09/shuffle_0_135_0.data.8b28bb76-9a1b-44cb-8a6f-d3922aabf896 (No such file or directory)
        at java.io.FileOutputStream.open0(Native Method)
        at java.io.FileOutputStream.open(FileOutputStream.java:270)
        at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
        at org.spark_project.guava.io.Files$FileByteSink.openStream(Files.java:223)
        at org.spark_project.guava.io.Files$FileByteSink.openStream(Files.java:211)
        at org.spark_project.guava.io.ByteSource.copyTo(ByteSource.java:203)
        at org.spark_project.guava.io.Files.copy(Files.java:436)
        at org.spark_project.guava.io.Files.move(Files.java:651)
        at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.mergeSpills(UnsafeShuffleWriter.java:277)
        at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.closeAndWriteOutput(UnsafeShuffleWriter.java:216)
        at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:169)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
        at org.apache.spark.scheduler.Task.run(Task.scala:99)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
... ...
18/06/12 14:34:24 ERROR YarnClusterScheduler: Lost executor 2 on kmr-a20125dd-gn-a05044c6-core-1-008.ksc.com: Container killed by YARN for exceeding memory limits. 5.1 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 14:35:16 ERROR YarnClusterScheduler: Lost executor 1 on kmr-a20125dd-gn-a05044c6-core-1-001.ksc.com: Container killed by YARN for exceeding memory limits. 5.0 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 14:40:32 ERROR YarnClusterScheduler: Lost executor 3 on kmr-a20125dd-gn-a05044c6-core-1-008.ksc.com: Container killed by YARN for exceeding memory limits. 5.1 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 14:41:39 ERROR YarnClusterScheduler: Lost executor 4 on kmr-a20125dd-gn-a05044c6-core-1-001.ksc.com: Container killed by YARN for exceeding memory limits. 5.1 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 14:48:11 ERROR YarnClusterScheduler: Lost executor 6 on kmr-a20125dd-gn-a05044c6-core-1-001.ksc.com: Container killed by YARN for exceeding memory limits. 5.1 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 14:46:31 ERROR YarnClusterScheduler: Lost executor 5 on kmr-a20125dd-gn-a05044c6-core-1-008.ksc.com: Container killed by YARN for exceeding memory limits. 5.0 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 15:14:37 ERROR YarnClusterScheduler: Lost executor 32 on kmr-a20125dd-gn-a05044c6-core-1-004.ksc.com: Container killed by YARN for exceeding memory limits. 5.0 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
... ...
18/06/12 15:15:01 ERROR YarnClusterScheduler: Lost executor 31 on kmr-a20125dd-gn-a05044c6-core-1-003.ksc.com: Container killed by YARN for exceeding memory limits. 5.0 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
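
The scheduler messages above repeatedly suggest raising spark.yarn.executor.memoryOverhead, the off-heap headroom YARN accounts for on top of each executor's --executor-memory (by default roughly max(384 MB, 10% of executor memory)). For reference, acting on that hint would look something like adding the setting to the submit command in f0.sh; the --jars and classpath options are omitted below for brevity, and the 2048 MB value is illustrative, not taken from the original script:

spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 4 \
  --executor-memory 4G \
  --executor-cores 4 \
  --driver-memory 4G \
  --queue fintech \
  --conf spark.yarn.executor.memoryOverhead=2048 \
  ${src_path}/f0.py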

In the two failed runs above, each job processed the most recent two weeks of data in a single pass. After changing each run to process only one day of data, the job succeeded.
The submit parameters of the successful run were as follows (a sketch of a per-day driver loop is given after the parameter listing):

org.apache.spark.deploy.SparkSubmit \
--master yarn \
--deploy-mode client \
--conf spark.driver.memory=2G \
--conf spark.executor.extraClassPath=/usr/hdp/2.5.0.0-1245/spark/lib/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-server.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-protocol.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-hadoop2-compat.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-client.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-common.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/htrace-core-3.1.0-incubating.jar \
--conf spark.driver.extraClassPath=/usr/hdp/2.5.0.0-1245/spark/lib/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-server.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-protocol.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-hadoop2-compat.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-client.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-common.jar:/usr/hdp/2.5.0.0-1245/hbase/lib/htrace-core-3.1.0-incubating.jar \
--jars /usr/hdp/2.5.0.0-1245/spark/lib/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-server.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-protocol.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-hadoop2-compat.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-client.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/hbase-common.jar,/usr/hdp/2.5.0.0-1245/hbase/lib/htrace-core-3.1.0-incubating.jar,/usr/hdp/2.6.1.0-129/hadoop/lib/hadoop-ks3-0.1.jar \
--num-executors 4 \
--executor-memory 4G \
--executor-cores 4 \
--name tony_f_0_f1 \
/mnt/code/yanghaolan/recsys_engine_model/model/data/workflow/lr_0/f1.py
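
For completeness, a minimal sketch of how such per-day runs could be driven from a shell loop. The date list and the trailing date argument are assumptions for illustration; f1.py's actual arguments are not shown in this post, and the --jars/classpath options are omitted for brevity.

#!/usr/bin/env bash
# Hypothetical driver: submit one Spark job per day instead of one job covering two weeks of data.
for day in 20180601 20180602 20180603; do
  spark-submit \
    --master yarn \
    --deploy-mode client \
    --num-executors 4 \
    --executor-memory 4G \
    --executor-cores 4 \
    --name "tony_f_0_f1_${day}" \
    /mnt/code/yanghaolan/recsys_engine_model/model/data/workflow/lr_0/f1.py "${day}"
done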

That's all for this post. If you feel you learned something, please give the article a like on the left ^_^

Reposted from blog.csdn.net/qq_31598113/article/details/80670525