Apache Spark QA

Running in cluster mode: error says a master URL must be set

When running in cluster mode, if the SparkContext is defined outside the main method:

import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object ActivityToAddress extends Logging {
  // These fields live in the object's static initializer (<clinit>),
  // which also runs on any executor that loads this class.
  val conf = new SparkConf().setAppName("ActivityToAddress")
  val sc = new SparkContext(conf)
  val sqlContext = new HiveContext(sc)

  import sqlContext._
  import sqlContext.implicits._

  def main(args: Array[String]): Unit = {
    sql("select * from activity").count
  }
}

When running with spark-submit, the master is set to Mesos:

/usr/install/spark/bin/spark-submit \
--master mesos://zk://192.168.6.55:2181,192.168.6.56:2181,192.168.6.57:2181/mesos \
--class cn.fraudmetrix.vulcan.address.ActivityToAddress \
--conf spark.address.year=2017 \
--conf spark.address.month=2 \
--conf spark.address.day=7 \
--conf spark.address.json=1 \
--conf spark.address.run="test" \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.kryoserializer.buffer.mb=24 \
/usr/install/spark-app-1.0.0-SNAPSHOT-jar-with-dependencies.jar

But the job still fails:

org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:401)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$.<init>(ActivityToAddress.scala:20)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$.<clinit>(ActivityToAddress.scala)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$$anonfun$2.apply(ActivityToAddress.scala:46)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$$anonfun$2.apply(ActivityToAddress.scala:46)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
11:48:35.657 [Executor task launch worker-0] ERROR org.apache.spark.executor.Executor - Exception in task 5.0 in stage 3.0 (TID 91)
java.lang.ExceptionInInitializerError: null
at cn.fraudmetrix.vulcan.address.ActivityToAddress$$anonfun$2.apply(ActivityToAddress.scala:46)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$$anonfun$2.apply(ActivityToAddress.scala:46)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:401)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$.<init>(ActivityToAddress.scala:20)
at cn.fraudmetrix.vulcan.address.ActivityToAddress$.<clinit>(ActivityToAddress.scala)
... 14 common frames omitted

The stack trace shows the cause: the exception is raised on an executor thread ("Executor task launch worker-0"), inside the static initializer (<clinit>) of the ActivityToAddress object. When a task's closure references the object, each executor loads the class and runs the object's field initializers, which try to construct another SparkContext there; the executor JVM has no master URL configured, so the constructor fails with "A master URL must be set". Solution: move the context initialization into the main method, so it only runs on the driver:

import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object ActivityToAddress extends Logging {
  def main(args: Array[String]): Unit = {
    // Created only on the driver, after spark-submit has set the master.
    val conf = new SparkConf().setAppName("ActivityToAddress")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    import sqlContext._
    import sqlContext.implicits._

    sql("select * from activity").count
  }
}
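A variant sometimes used when the contexts must stay at object level (a sketch of mine, not from the original post) is to declare them as lazy vals: they are then built on first access in main, on the driver, rather than during class initialization, so merely loading the class on an executor is harmless. They must still never be touched inside a closure that runs on executors. The object name ActivityToAddressLazy is hypothetical:

import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object ActivityToAddressLazy extends Logging {
  // lazy vals are initialized on first access (here: in main, on the driver),
  // not in <clinit>, so executors can load this class without building a context.
  lazy val conf = new SparkConf().setAppName("ActivityToAddress")
  lazy val sc = new SparkContext(conf)
  lazy val sqlContext = new HiveContext(sc)

  def main(args: Array[String]): Unit = {
    sqlContext.sql("select * from activity").count
  }
}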

Small parquet files

The following hive-site.xml settings enable Snappy compression for final and intermediate output (block-level for intermediates) and tag output files with a .snappy.parquet extension:

<property>
  <name>hive.exec.compress.output</name>
  <value>true</value>
</property>
<property>
  <name>hive.exec.compress.intermediate</name>
  <value>true</value>
</property>
<property>
  <name>hive.intermediate.compression.codec</name>
  <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
  <name>hive.intermediate.compression.type</name>
  <value>BLOCK</value>
</property>
<property>
  <name>hive.output.file.extension</name>
  <value>.snappy.parquet</value>
</property>
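These settings only control compression; they do not by themselves merge fragments. If a Spark job still writes many small parquet files, one complementary approach (my suggestion, not from the original post) is to coalesce the DataFrame to a few partitions before writing, so each partition yields one larger file. A minimal sketch assuming the Spark 1.4+ DataFrame API with the HiveContext used above; the object name, target of 8 partitions, and output path are hypothetical:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object CompactParquet {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CompactParquet"))
    val sqlContext = new HiveContext(sc)

    // coalesce(8) is an assumed target: pick roughly total data size
    // divided by the desired file size (e.g. one HDFS block per file).
    sqlContext.sql("select * from activity")
      .coalesce(8)
      .write
      .parquet("/tmp/activity_compacted") // hypothetical output path
  }
}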
