from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
# Obtain the Spark session for this script under the application name 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)
# 1. Create a DataFrame from a list
# Rows are plain tuples; only the column names are supplied, so the column
# types are left for Spark to infer from the values.
columns = ["id", "name"]
data = [
    (1, "alice"),
    (2, "Blob"),
    (3, "Charlie"),
]
df = spark.createDataFrame(data, columns)
df.show()
# 2. Create a DataFrame from a list of dictionaries
# Each dictionary is one row; no schema is passed, so the column names come
# from the dictionary keys ('name', 'age').
data1 = [
    {'name': 'Alice', 'age': 25},
    {'name': 'Bob', 'age': 30},
]
df1 = spark.createDataFrame(data1)
df1.show()
# 3. Read a DataFrame from a file
# Load the CSV, treating its first line as the header and letting Spark
# infer the column types, then preview the first five rows.
df2 = spark.read.csv(
    "911.csv",
    header=True,
    inferSchema=True,
)
df2.show(n=5)
# 4. Create a DataFrame with an explicitly defined schema
# Explicit schema: each column's type is pinned and every column is declared
# non-nullable. StructField needs DataType *instances* (IntegerType(), not
# the bare class), otherwise PySpark rejects the field definition.
schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=False),
])
data3 = [(1, "alice", 28), (2, 'Blob', 33), (3, 'Charlie', 26)]
df3 = spark.createDataFrame(data3, schema=schema)
df3.show()
# 5. Create a DataFrame from pandas
import pandas as pd

# Build a small pandas DataFrame first, then hand it to Spark for conversion.
pandas_df = pd.DataFrame(data={'name': ['alice', 'bob'], 'age': [23, 24]})
# A bare `pandas_df.head()` only renders in a notebook; print it so the
# preview is also visible when this runs as a script.
print(pandas_df.head())
df4 = spark.createDataFrame(pandas_df)
df4.show()
# 6. Read JSON where every line is a JSON object
# The input file is expected to hold one JSON object per line.
df5 = spark.read.json(path='info.json')
df5.show()