当前位置：首页 > news >正文

pandas库

news 2024/10/24 20:03:33

1、概述

Pandas 是一个开源的第三方 Python 库，在 Numpy 和 Matplotlib 的基础上构建而来
Pandas 名字衍生自术语 “panel data”（面板数据）和 “Python data analysis”（Python 数据分析）
Pandas 已经成为 Python 数据分析的必备高级工具，它的目标是成为任何语言中最强大、最灵活的开源数据分析工具
Pandas 是 Python 语言的一个扩展程序库，用于数据分析
Pandas 是一个开放源码、BSD 许可的库，提供高性能、易于使用的数据结构和数据分析工具
Pandas 是一个强大的分析结构化数据的工具集，基础是 Numpy（提供高性能的矩阵运算）
Pandas 可以从各种文件格式比如 CSV、JSON、SQL、Microsoft Excel 导入数据
Pandas 可以对各种数据进行运算操作，比如归并、再成形、选择，还具备数据清洗和数据加工等功能
Pandas 广泛应用在学术、金融、统计学等各个数据分析领域
Pandas 的出现使得 Python 做数据分析的能力得到了大幅度提升，它主要实现了数据分析的五个重要环节：加载数据、整理数据、操作数据、构建数据模型、分析数据

2、安装

pip install pandas==1.1.5 -i https://pypi.tuna.tsinghua.edu.cn/simple/

3、方法示例

# pandas为必要包
# numpy和matplotlib作为支撑，在pandas中经常能用到
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def pd_series():
    """Print Series objects built from each common kind of input.

    Covers an empty Series, a plain list, a list with explicit labels, a
    NumPy array, a dict and a scalar. Everything is written to stdout and
    nothing is returned.
    """
    # Empty Series. The dtype is spelled out because pandas 1.x warns when it
    # is omitted and falls back to float64, while newer releases use object.
    series01 = pd.Series(dtype=object)
    print(series01)
    print()

    # Series initialised from a list; the index defaults to 0..n-1.
    series02 = pd.Series(['张三', '李四', '王五', '赵六'])
    print(series02)
    print()

    # Series with an explicit index.
    series03 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    print(series03)
    print()

    # Series built from an ndarray; the dtype is taken from the array.
    arr = np.array([1, 2, 3, 4])
    series04 = pd.Series(arr)
    print(series04)
    print()

    # Series built from a dict: the keys become the index labels.
    dic = {'name': '张三', 'age': 18, 'sex': '男'}
    series05 = pd.Series(dic)
    print(series05)
    print()

    # Series built from a scalar: the value is repeated once per index label.
    # (Without an index, pd.Series(5) would hold a single element.)
    series06 = pd.Series(5, index=['a', 'b', 'c'])
    print(series06)


if __name__ == '__main__':
    pd_series()

Series([], dtype: object)

0    张三
1    李四
2    王五
3    赵六
dtype: object

a    1
b    2
c    3
d    4
dtype: int64

0    1
1    2
2    3
3    4
dtype: int32

name    张三
age     18
sex      男
dtype: object

a    5
b    5
c    5
dtype: int64

def pd_series_atr():
    """Print the basic attributes of a five-letter Series, one per line.

    The attributes are printed in this order:
    axes (list holding the row index), dtype, empty (whether the Series has
    no elements), ndim, size (element count), values (the data as an
    ndarray) and index.
    """
    letters = pd.Series(list('abcde'))
    attribute_names = ('axes', 'dtype', 'empty', 'ndim', 'size', 'values', 'index')
    for attribute in attribute_names:
        print(getattr(letters, attribute))


if __name__ == '__main__':
    pd_series_atr()

[RangeIndex(start=0, stop=5, step=1)]
object
False
1
5
['a' 'b' 'c' 'd' 'e']
RangeIndex(start=0, stop=5, step=1)

def pd_series_method():
    """Print the results of common Series methods on a Series containing None.

    Each section is followed by a blank line. Some sections are preceded by
    a caption; the captions are printed exactly as listed below.
    """
    letters = pd.Series(['a', 'b', 'c', 'd', 'e', None])

    # (caption or None, value to print), in output order.
    sections = [
        (None, letters),
        # head()/tail() return 5 elements unless a count is passed.
        ('前5个元素为:', letters.head()),
        ('后5个元素为:', letters.tail()),
        # Element-wise missing / not-missing flags.
        ('该对象元素为:', letters.isnull()),
        ('该对象元素为:', letters.notnull()),
        # Summary statistics, sorted copy, and per-value counts.
        (None, letters.describe()),
        (None, letters.sort_values()),
        (None, letters.value_counts()),
    ]
    for caption, value in sections:
        if caption is not None:
            print(caption)
        print(value, end='\n\n')


if __name__ == '__main__':
    pd_series_method()

0       a
1       b
2       c
3       d
4       e
5    None
dtype: object

前5个元素为:
0    a
1    b
2    c
3    d
4    e
dtype: object

后5个元素为:
1       b
2       c
3       d
4       e
5    None
dtype: object

该对象元素为:
0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

该对象元素为:
0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

count     5
unique    5
top       a
freq      1
dtype: object

0       a
1       b
2       c
3       d
4       e
5    None
dtype: object

a    1
b    1
c    1
d    1
e    1
Name: count, dtype: int64

def pd_dataframe():
    """Print DataFrames built from each common kind of input.

    Every frame is printed followed by a blank line; nothing is returned.
    """
    four_names = ['张三', '李四', '王五', '赵六']
    records = [['张三', 21, '男'], ['李四', 20, '女'], ['王五', 19, '男']]
    by_column = {
        'name': ['张三', '李四', '王五'],
        'age': [21, 20, 19],
        'sex': ['男', '女', '男'],
    }

    frames = (
        # Empty frame.
        pd.DataFrame(),
        # Flat list: one column, labelled 0 by default.
        pd.DataFrame(four_names),
        # Same list with an explicit column name.
        pd.DataFrame(four_names, columns=['name']),
        # List of rows plus column names.
        pd.DataFrame(records, columns=['name', 'age', 'sex']),
        # Dict mapping column name -> list of values.
        pd.DataFrame(by_column),
        # Dict mapping column name -> Series.
        pd.DataFrame({key: pd.Series(values) for key, values in by_column.items()}),
    )
    for frame in frames:
        print(frame, end='\n\n')


if __name__ == '__main__':
    pd_dataframe()

Empty DataFrame
Columns: []
Index: []

    0
0  张三
1  李四
2  王五
3  赵六

  name
0   张三
1   李四
2   王五
3   赵六

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

import pandas as pd
def pd_col():
    """Demonstrate reading, adding, inserting and dropping DataFrame columns.

    Prints the intermediate frames to stdout; nothing is returned.
    """
    people = pd.DataFrame({
        'name': ['张三', '李四', '王五'],
        'age': [21, 20, 19],
        'sex': ['男', '女', '男'],
    })

    # Read a column: once as a plain Python list, once as a Series.
    print(people['name'].tolist())
    print(people['age'])

    # Assigning to a column name adds the column (or replaces an existing
    # one). A list or ndarray must have exactly one value per row; a Series
    # is aligned on the index, so missing rows are filled automatically.
    people['address'] = ['北京', '上海', '广州']
    people['sex'] = pd.Series(['male', 'female', 'male'])
    print(people)

    # insert() places a new column at a given position:
    #   loc    - position of the new column
    #   column - name of the new column
    #   value  - values of the new column
    people.insert(loc=0, column='id', value=pd.Series([0, 1, 2]))
    print(people)
    print()

    # All column labels could be replaced at once by assigning a list of
    # names to `columns`; that step is not performed here.

    # Drop the 'id' column again and show the final frame.
    people = people.drop(columns='id')
    print(f"df:{people}")


if __name__ == '__main__':
    pd_col()

['张三', '李四', '王五']
0    21
1    20
2    19
Name: age, dtype: int64
  name  age     sex address
0   张三   21    male      北京
1   李四   20  female      上海
2   王五   19    male      广州
   id name  age     sex address
0   0   张三   21    male      北京
1   1   李四   20  female      上海
2   2   王五   19    male      广州

df:  name  age     sex address
0   张三   21    male      北京
1   李四   20  female      上海
2   王五   19    male      广州

def pd_loc():
    """Demonstrate label-based selection with ``DataFrame.loc``.

    Prints, each followed by a blank line: the source dict, one row, a label
    slice of rows, one cell, and a row/column subset. Nothing is returned.
    """
    data = {
        "name": ['张三', '李四', '王五', '赵六'],
        "age": [21, 20, 19, 23],
        "sex": ['男', '女', '男', '男'],
        "address": ['北京', '上海', '广州', '深圳'],
    }
    print(data, end='\n\n')

    people = pd.DataFrame(data, index=list('abcd'))

    # A single row label selects that row and returns it as a Series.
    row_a = people.loc['a']
    print(row_a, end='\n\n')

    # A label slice includes both endpoints: rows a, b and c, as a DataFrame.
    rows_a_to_c = people.loc['a':'c']
    print(rows_a_to_c, end='\n\n')

    # Row label plus column label selects a single cell value.
    cell = people.loc['a', "name"]
    print(cell, end='\n\n')

    # Lists of row and column labels select a sub-DataFrame.
    subset = people.loc[['a', 'c'], ['name', 'age']]
    print(subset, end='\n\n')


if __name__ == '__main__':
    pd_loc()

{'name': ['张三', '李四', '王五', '赵六'], 'age': [21, 20, 19, 23], 'sex': ['男', '女', '男', '男'], 'address': ['北京', '上海', '广州', '深圳']}

name       张三
age        21
sex         男
address    北京
Name: a, dtype: object

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海
c   王五   19   男      广州

张三

  name  age
a   张三   21
c   王五   19

def pd_iloc():
    """Demonstrate position-based selection with ``DataFrame.iloc``.

    ``iloc`` takes integer positions rather than row or column labels.
    Prints one row, a positional slice, one cell, a row/column subset and a
    plain row slice of the frame. Nothing is returned.
    """
    people = pd.DataFrame(
        {
            "name": ['张三', '李四', '王五', '赵六'],
            "age": [21, 20, 19, 23],
            "sex": ['男', '女', '男', '男'],
            "address": ['北京', '上海', '广州', '深圳'],
        },
        index=list('abcd'),
    )

    # One integer position selects that row and returns it as a Series.
    first_row = people.iloc[0]
    print(first_row, end='\n\n')

    # Positional slices are half-open: positions 0 and 1, as a DataFrame.
    first_two = people.iloc[:2]
    print(first_two, end='\n\n')

    # Row position plus column position selects a single cell value.
    cell = people.iloc[0, 1]
    print(cell, end='\n\n')

    # Lists of row and column positions select a sub-DataFrame.
    subset = people.iloc[[0, 2], [0, 2]]
    print(subset, end='\n\n')

    # Slicing the frame directly also selects rows by position (half-open).
    print(people[:2])


if __name__ == '__main__':
    pd_iloc()

name       张三
age        21
sex         男
address    北京
Name: a, dtype: object

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海

21

  name sex
a   张三   男
c   王五   男

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海

def pd_append():
    """Append a Series to a DataFrame as a new row, then drop that row.

    Prints the frame after the append, a blank line, and the frame after the
    row has been removed. Nothing is returned.
    """
    data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
    df = pd.DataFrame(data, index=['a', 'b', 'c'])

    # The Series must be labelled with the frame's column names. With the
    # default 0..2 index, to_frame().T would produce columns 0, 1, 2 and
    # concat would align them as three extra NaN-filled columns instead of
    # filling A, B and C. The Series name becomes the new row label.
    s = pd.Series([10, 20, 30], index=df.columns, name='d')

    # Turn the Series into a one-row frame and stack it below df.
    df1 = pd.concat([df, s.to_frame().T], axis=0)
    print(df1)
    print()

    # drop with axis=0 removes rows by label.
    df2 = df1.drop(['d'], axis=0)
    print(df2)


if __name__ == '__main__':
    pd_append()

     A    B    C     0     1     2
a  1.0  4.0  7.0   NaN   NaN   NaN
b  2.0  5.0  8.0   NaN   NaN   NaN
c  3.0  6.0  9.0   NaN   NaN   NaN
d  NaN  NaN  NaN  10.0  20.0  30.0

     A    B    C   0   1   2
a  1.0  4.0  7.0 NaN NaN NaN
b  2.0  5.0  8.0 NaN NaN NaN
c  3.0  6.0  9.0 NaN NaN NaN

查看全文

http://www.mrgr.cn/news/57807.html