目录
定义
基本操作
一维数组操作
二维数组操作
数据选择过滤
数据处理
数据清洗
数据转换
数据分析
排序
分组聚合
数据透视表
高级操作
合并数据
时间序列处理
自定义函数调用
数据可视化集成
数据导出和导入
大数据分块处理
定义
全称: 'panel data' and 'python data analysis'
Analy: Series(一维数据)、DataFrame(二维数据)
主要应用: 数据清洗:处理缺失数据、重复数据等
数据转换:改变数据的形状、结构或格式
数据分析: 进行统计、聚合、分组
数据可视化:整合Matplotlib 进行数据可视化
基本操作
一维数组操作
import pandas as pda = [1, 2, 3]
mystr = pd.Series(a)
print('列表')
print(mystr)a = ['a', 'b', 'c']
mystr = pd.Series(a, index=['x', 'y', 'z'])
print('自定义列名')
print(mystr)str_dict = {'1': 'a', '2': 'b'}
mystr = pd.Series(str_dict)
print('字典类型')
print(mystr)#############运行结果########################
# 列表
# 0 1
# 1 2
# 2 3
# dtype: int64
# 自定义列名
# x a
# y b
# z c
# dtype: object
# 字典类型
# 1 a
# 2 b
# dtype: object
二维数组操作
import pandas as pd
import numpy as np# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
print("从字典创建的DataFrame:")
print(df)# 从列表创建
data = [['Apple', 10], ['Banana', 15], ['Orange', 8]]
df_fruits = pd.DataFrame(data, columns=['Fruit', 'Price'])
print("\n从列表创建的DataFrame:")
print(df_fruits)# 查看前几行
print("\n前两行:")
print(df.head(2))# 查看基本信息
print("\nDataFrame信息:")
print(df.info())# 查看统计信息
print("\n统计信息:")
print(df.describe())# 查看列名
print("\n列名:")
print(df.columns)# 查看形状
print("\n形状(行,列):")
print(df.shape)#############运行结果################
# 从字典创建的DataFrame:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 从列表创建的DataFrame:
# Fruit Price
# 0 Apple 10
# 1 Banana 15
# 2 Orange 8
#
# 前两行:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
#
# DataFrame信息:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Name 4 non-null object
# 1 Age 4 non-null int64
# 2 City 4 non-null object
# dtypes: int64(1), object(2)
# memory usage: 224.0+ bytes
# None
#
# 统计信息:
# Age
# count 4.000000
# mean 32.500000
# std 6.454972
# min 25.000000
# 25% 28.750000
# 50% 32.500000
# 75% 36.250000
# max 40.000000
#
# 列名:
# Index(['Name', 'Age', 'City'], dtype='object')
#
# 形状(行,列):
# (4, 3)
数据选择过滤
import pandas as pd
import numpy as np# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)# 选择列
print("\n选择Name列:")
print(df['Name'])# 选择多列
print("\n选择Name和Age列:")
print(df[['Name', 'Age']])# 选择行
print("\n选择第二行:")
print(df.iloc[1])# 选择多行
print("\n选择前两行:")
print(df.iloc[0:2])# 条件选择
print("\n选择年龄大于30的人:")
print(df[df['Age'] > 30])# 使用loc选择
print("\n使用loc选择:")
print(df.loc[df['Age'] > 30, ['Name', 'City']])############运行结果######################
# 选择Name列:
# 0 Alice
# 1 Bob
# 2 Charlie
# 3 David
# Name: Name, dtype: object
#
# 选择Name和Age列:
# Name Age
# 0 Alice 25
# 1 Bob 30
# 2 Charlie 35
# 3 David 40
#
# 选择第二行:
# Name Bob
# Age 30
# City Paris
# Name: 1, dtype: object
#
# 选择前两行:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
#
# 选择年龄大于30的人:
# Name Age City
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 使用loc选择:
# Name City
# 2 Charlie London
# 3 David Tokyo
数据处理
数据清洗
import pandas as pd
import numpy as np# 创建有缺失值的DataFrame
data = {'A': [1, 2, np.nan, 4],'B': [5, np.nan, np.nan, 8],'C': [10, 20, 30, 40]}
df_missing = pd.DataFrame(data)
print("\n有缺失值的DataFrame:")
print(df_missing)# 检查缺失值
print("\n缺失值统计:")
print(df_missing.isnull().sum())# 填充缺失值
print("\n填充缺失值:")
print(df_missing.fillna(value={'A': 0, 'B': df_missing['B'].mean()}))# 删除缺失值
print("\n删除包含缺失值的行:")
print(df_missing.dropna())# 删除全为缺失值的列
print("\n删除全为缺失值的列:")
print(df_missing.dropna(axis=1, how='all'))#################运行结果#############################
# 有缺失值的DataFrame:
# A B C
# 0 1.0 5.0 10
# 1 2.0 NaN 20
# 2 NaN NaN 30
# 3 4.0 8.0 40
#
# 缺失值统计:
# A 1
# B 2
# C 0
# dtype: int64
#
# 填充缺失值:
# A B C
# 0 1.0 5.0 10
# 1 2.0 6.5 20
# 2 0.0 6.5 30
# 3 4.0 8.0 40
#
# 删除包含缺失值的行:
# A B C
# 0 1.0 5.0 10
# 3 4.0 8.0 40
#
# 删除全为缺失值的列:
# C
# 0 10
# 1 20
# 2 30
# 3 40
数据转换
import pandas as pd# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
print(df.dtypes)# 类型转换
df['Age'] = df['Age'].astype('float')
print("\n转换Age列为浮点型:")
print(df.dtypes)# 重命名列
df_renamed = df.rename(columns={'Name': 'Full Name', 'City': 'Location'})
print("\n重命名列后:")
print(df_renamed.columns)# 替换值
df_replaced = df.replace({'Paris': 'Rome', 'London': 'Berlin'})
print("\n替换值后:")
print(df_replaced)# 应用函数
print("\n应用函数:")
print(df['Age'].apply(lambda x: x + 1))# 字符串操作
print("\n字符串操作:")
print(df['Name'].str.upper())
数据分析
排序
import pandas as pd# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 按列排序
print("\n按Age降序排序:")
print(df.sort_values('Age', ascending=False)) # ascending=False, 反序# 多列排序
print("\n先按City升序,再按Age降序排序:")
print(df.sort_values(['City', 'Age'], ascending=[True, False]))
分组聚合
import pandas as pd# 创建示例数据
data = {'Department': ['Sales', 'Sales', 'IT', 'IT', 'HR', 'HR'],'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],'Salary': [5000, 6000, 7000, 8000, 5500, 6500]}
df_company = pd.DataFrame(data)# 分组聚合
print("\n按部门计算平均工资:")
print(df_company.groupby('Department')['Salary'].mean())# 多聚合函数
print("\n按部门计算工资的多种统计量:")
print(df_company.groupby('Department')['Salary'].agg(['mean', 'max', 'min', 'count']))# 多列分组
print("\n按部门分组并显示员工列表:")
print(df_company.groupby('Department')['Employee'].apply(list))#############运行结果##################
# 按部门计算平均工资:
# Department
# HR 6000.0
# IT 7500.0
# Sales 5500.0
# Name: Salary, dtype: float64
#
# 按部门计算工资的多种统计量:
# mean max min count
# Department
# HR 6000.0 6500 5500 2
# IT 7500.0 8000 7000 2
# Sales 5500.0 6000 5000 2
#
# 按部门分组并显示员工列表:
# Department
# HR [Eve, Frank]
# IT [Charlie, David]
# Sales [Alice, Bob]
# Name: Employee, dtype: object
数据透视表
import pandas as pd
import numpy as np# 创建示例数据
data = {'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],'Product': ['A', 'B', 'A', 'B'],'Sales': [100, 150, 200, 50]}
df_sales = pd.DataFrame(data)# 创建透视表
pivot = pd.pivot_table(df_sales, values='Sales', index='Date', columns='Product', aggfunc=np.sum)
print("\n透视表:")
print(pivot)# 添加汇总
pivot_margins = pd.pivot_table(df_sales, values='Sales', index='Date', columns='Product',aggfunc=np.sum, margins=True, margins_name='Total')
print("\n带汇总的透视表:")
print(pivot_margins)# 透视表:
# Product A B
# Date
# 2023-01-01 100 150
# 2023-01-02 200 50
#
# 带汇总的透视表:
# Product A B Total
# Date
# 2023-01-01 100 150 250
# 2023-01-02 200 50 250
# Total 300 200 500
高级操作
合并数据
import pandas as pd# 创建两个DataFrame
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['B', 'C', 'D'], 'value2': [4, 5, 6]})# 内连接
print("\n内连接:")
print(pd.merge(df1, df2, on='key', how='inner'))# 左连接
print("\n左连接:")
print(pd.merge(df1, df2, on='key', how='left'))# 左连接
print("\n右连接:")
print(pd.merge(df1, df2, on='key', how='right'))# 外连接
print("\n外连接:")
print(pd.merge(df1, df2, on='key', how='outer'))# 连接多个键
df3 = pd.DataFrame({'key1': ['A', 'B', 'C'], 'key2': ['X', 'Y', 'Z'], 'value': [10, 20, 30]})
df4 = pd.DataFrame({'key1': ['B', 'C', 'D'], 'key2': ['Y', 'Z', 'W'], 'value': [40, 50, 60]})
print("\n多键连接:")
print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
时间序列处理
import pandas as pd
import numpy as np# 创建时间序列数据
dates = pd.date_range('20230101', periods=6)
df_time = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print("\n时间序列DataFrame:")
print(df_time)# 时间索引选择
print("\n选择特定日期范围:")
print(df_time['2023-01-02':'2023-01-04'])# 重采样
print("\n按月重采样求均值:")
print(df_time.resample('ME').mean())###################运行结果#########################
# 时间序列DataFrame:
# A B C D
# 2023-01-01 -0.800839 -0.953824 0.958557 1.044269
# 2023-01-02 0.203414 1.462324 0.099291 0.457230
# 2023-01-03 -0.113450 0.441178 -0.779749 1.158564
# 2023-01-04 -0.741451 -1.353738 1.906289 2.513789
# 2023-01-05 -1.719984 -0.305153 -1.283332 0.728910
# 2023-01-06 -0.363050 -1.591411 -2.289244 -0.076304
#
# 选择特定日期范围:
# A B C D
# 2023-01-02 0.203414 1.462324 0.099291 0.457230
# 2023-01-03 -0.113450 0.441178 -0.779749 1.158564
# 2023-01-04 -0.741451 -1.353738 1.906289 2.513789
#
# 按月重采样求均值:
# A B C D
# 2023-01-31 -0.589227 -0.383437 -0.231365 0.971077
自定义函数调用
import pandas as pd# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 使用apply应用自定义函数
def age_group(age):if age < 30:return 'Young'elif age < 40:return 'Middle'else:return 'Senior'print("\n应用自定义函数:")
df['Age Group'] = df['Age'].apply(age_group)
print(df)# 使用pipe进行方法链
def add_prefix(df, prefix):df['Name'] = prefix + df['Name']return dfprint("\n使用pipe:")
print(df.pipe(add_prefix, prefix='Mr. '))#####################运行结果###########################
# 应用自定义函数:
# Name Age City Age Group
# 0 Alice 25 New York Young
# 1 Bob 30 Paris Middle
# 2 Charlie 35 London Middle
# 3 David 40 Tokyo Senior
#
# 使用pipe:
# Name Age City Age Group
# 0 Mr. Alice 25 New York Young
# 1 Mr. Bob 30 Paris Middle
# 2 Mr. Charlie 35 London Middle
# 3 Mr. David 40 Tokyo Senior
数据可视化集成
import pandas as pdimport matplotlib.pyplot as plt# 创建示例数据
df_plot = pd.DataFrame({'Year': [2010, 2011, 2012, 2013, 2014],'Sales': [100, 120, 150, 180, 200],'Profit': [20, 25, 30, 35, 40]
})# 绘制折线图
df_plot.plot(x='Year', y=['Sales', 'Profit'], kind='line', title='Sales and Profit Over Years')
plt.show()# 绘制柱状图
df_plot.plot(x='Year', y='Sales', kind='bar', title='Sales by Year')
plt.show()# 绘制散点图
df_plot.plot(x='Sales', y='Profit', kind='scatter', title='Profit vs Sales')
plt.show()
运行结果:
数据导出和导入
import pandas as pd# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 导出到CSV
df.to_csv('output.csv', index=False)# 从CSV导入
df_imported = pd.read_csv('output.csv')
print("\n从CSV导入的数据:")
print(df_imported.head())# 导出到Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)# 从Excel导入
df_excel = pd.read_excel('output.xlsx', sheet_name='Sheet1')
print("\n从Excel导入的数据:")
print(df_excel.head())#####################运行结果################################
# 从CSV导入的数据:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 从Excel导入的数据:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
大数据分块处理
import pandas as pd# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'City': ['Paris', 'Paris', 'London', 'London']}
df = pd.DataFrame(data)
# 导出到CSV
df.to_csv('output.csv', index=False)# 分块读取大文件
chunk_size = 3
chunks = pd.read_csv('output.csv', chunksize=chunk_size)# 分块处理
results = []
for chunk in chunks:processed = chunk.groupby('City')['Age'].sum()results.append(processed)# 合并结果
final_result = pd.concat(results).groupby(level=0).sum()
print(final_result)############运行结果##########################
# City
# London 75
# Paris 55
# Name: Age, dtype: int64
参考来源: deepseek。。。