Pyspark & Pandas
pyspark常用操作
spark连接
from pyspark.sql import SparkSession
from pyspark.shell import sc
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import ArrayType
class SparkUtils:
def __init__(self):
self.spark = None
def get_spark(self):
if self.spark is None:
self.spark = SparkSession.builder.appName("username") \
.enableHiveSupport().config("spark.sql.shuffle.partitions", "500") \
.config("spark.sql.broadcastTimeout", "3600") \
.config("spark.driver.memory", "200g") \
.config("spark.executor.memory", "40g") \
.config("spark.yarn.appMasterEnv.yarn.nodemanager.container-executor.class", "DockerLinuxContainer") \
.config("spark.executorEnv.yarn.nodemanager.container-executor.class", "DockerLinuxContainer") \
.config("spark.yarn.appMasterEnv.yarn.nodemanager.docker-container-executor.image-name",
"bdp-docker.jd.com:5000/wise_mart_bag:latest") \
.config("spark.executorEnv.yarn.nodemanager.docker-container-executor.image-name",
"bdp-docker.jd.com:5000/wise_mart_bag:latest") \
.getOrCreate()
return self.spark
spark = SparkUtils()
# 生成dataframe
spark_data = spark.sql("""
select
id,
username,
num
from
table1
where
status in (1, 2, 3)
and dt = '{}'
""".format(date))
# 创建sql数据表
sp_test.createOrReplaceTempView('data')
常用命令
参考:
# 创建第一个dataframe
pandas常用操作
import pandas as pd