Pandas

Pandas 是基于NumPy 的一种工具，该工具是为了解决数据分析任务而创建的。Pandas 纳入了大量库和一些标准的数据模型，提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。

Pandas 基于两种核心数据结构：Series 与 DataFrame。

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# Create a Series without passing an explicit index.
# The index argument is optional: when it is omitted, pandas assigns a
# default integer index 0 .. len(data) - 1, much like array positions.
sel = Series([1, 2, 3, 4])
print(sel)
# Usually we supply our own index labels.
# Equivalent spelling: Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
sel = Series(data=[1, 2, 3, 4], index=list('abcd'))
print(sel)
# The underlying values
print(sel.values)
# The index labels
print(sel.index)
# The (label, value) pairs. Series.iteritems() was removed in pandas 2.0;
# Series.items() yields the same pairs and also exists in older releases.
print(list(sel.items()))
输出：
0    1
1    2
2    3
3    4
dtype: int64
a    1
b    2
c    3
d    4
dtype: int64
[1 2 3 4]
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 1), ('b', 2), ('c', 3), ('d', 4)]

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# Build a Series from a dictionary: the keys become the index labels.
# The dictionary gets its own name so the builtin `dict` is not shadowed.
color_counts = {"red": 100, "black": 400, "green": 300, "pink": 900}
se3 = Series(color_counts)
print(se3)
# Reading data out of a Series
sel = Series(data=[1, 2, 3, 4], index=list('abcd'))
print(sel)
# A Series can be read both by label and by position, for example
# (left commented out so the printed output stays as recorded):
#   print('索引下标', sel['c'])
#   print('位置下标', sel.iloc[2])
# Select several non-adjacent items at once
print('索引下标', sel[['a', 'c']])
# Positions go through .iloc: treating integer keys in sel[...] as positions
# on a label-indexed Series is deprecated in recent pandas releases.
print('位置下标', sel.iloc[[1, 3]])
# Slices work as well
print('位置切片', sel.iloc[1:3])  # start included, stop excluded
print('索引切片', sel['b':'d'])  # both ends included
# Assign new index labels in place
sel.index = list('dcba')
print(sel)
# reindex returns a new Series ordered by the given labels; labels that do
# not exist in the original are filled with NaN
print(sel.reindex(['b', 'a', 'c', 'd', 'e']))
# drop removes the entries with the given labels
se1 = pd.Series(range(10, 15))
print(se1)
print(se1.drop([2, 3]))

输出：
red      100
black    400
green    300
pink     900
dtype: int64

a    1
b    2
c    3
d    4
dtype: int64

索引下标 a    1
c    3
dtype: int64

位置下标 b    2
d    4
dtype: int64

位置切片 b    2
c    3
dtype: int64

索引切片 b    2
c    3
d    4
dtype: int64

d    1
c    2
b    3
a    4
dtype: int64

b    3.0
a    4.0
c    2.0
d    1.0
e    NaN
dtype: float64

0    10
1    11
2    12
3    13
4    14
dtype: int64

0    10
1    11
4    14
dtype: int64

Series 算术运算操作

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# Arithmetic between two Series is aligned on their index labels.
# The operators + - * / combine the operands label by label. A label that
# is present in only one operand produces NaN in the result, and the
# results printed below come back as float64.
series1 = pd.Series(data=[1, 2, 3, 4], index=['London', 'HongKong', 'Humbai', 'lagos'])
series2 = pd.Series(data=[1, 3, 6, 4], index=['London', 'Accra', 'lagos', 'Delhi'])
divider = "-" * 20
for outcome in (series1 - series2, series1 + series2, series1 * series2):
    print(outcome)
    print(divider)
# NumPy-style array operations are supported too
sel = Series([1, 6, 3, 5], index=list('abcd'))
above_two = sel > 2
print(sel[above_two])  # filter with a boolean mask
print(divider)
print(sel * 2)  # multiply by a scalar
print(divider)
print(np.square(sel))  # a Series can be passed straight to NumPy math functions
输出：
Accra       NaN
Delhi       NaN
HongKong    NaN
Humbai      NaN
London      0.0
lagos      -2.0
dtype: float64
--------------------
Accra        NaN
Delhi        NaN
HongKong     NaN
Humbai       NaN
London       2.0
lagos       10.0
dtype: float64
--------------------
Accra        NaN
Delhi        NaN
HongKong     NaN
Humbai       NaN
London       1.0
lagos       24.0
dtype: float64
--------------------
b    6
c    3
d    5
dtype: int64
--------------------
a     2
b    12
c     6
d    10
dtype: int64
--------------------
a     1
b    36
c     9
d    25
dtype: int64

DataFrame 创建

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# 1. Creating DataFrames
# From a two-dimensional array (random integers, so the values differ per run)
df1 = DataFrame(np.random.randint(0, 10, (4, 4)), index=[1, 2, 3, 4], columns=['a', 'b', 'c', 'd'])
print(df1)
print("-" * 20)
# From a dictionary: row labels come from `index`, column labels from the keys.
# The dictionary gets its own name so the builtin `dict` is not shadowed.
province_data = {
    'Province': ['Guangdong', 'Beijing', 'Qinghai', 'Fujian'],
    'pop': [1.3, 2.5, 1.1, 0.7],
    'year': [2018, 2018, 2018, 2018],
}
df2 = pd.DataFrame(province_data, index=[1, 2, 3, 4])
print(df2)
print("-" * 20)
# Using from_dict
dict2 = {"a": [1, 2, 3], "b": [4, 5, 6]}
df6 = pd.DataFrame.from_dict(dict2)
print(df6)
print("-" * 20)
# Columns built from Series are aligned on their index labels; a label that
# is missing from one of the Series is filled with NaN in that column.
data = {
    'Name': pd.Series(['zs', 'ls', 'we'], index=['a', 'b', 'c']),
    'Age': pd.Series(['10', '20', '30', '40'], index=['a', 'b', 'c', 'd']),
    'country': pd.Series(['中国', '日本', '韩国'], index=['a', 'c', 'b']),
}
df = pd.DataFrame(data)
print(df)
print("-" * 20)
# to_dict() converts the DataFrame back into a (nested) dictionary
df_as_dict = df.to_dict()
print(df_as_dict)
输出：
   a  b  c  d
1  2  0  9  0
2  5  2  5  0
3  5  5  6  1
4  6  0  6  0
--------------------
    Province  pop  year
1  Guangdong  1.3  2018
2    Beijing  2.5  2018
3    Qinghai  1.1  2018
4     Fujian  0.7  2018
--------------------
   a  b
0  1  4
1  2  5
2  3  6
--------------------
  Name Age country
a   zs  10      中国
b   ls  20      韩国
c   we  30      日本
d  NaN  40     NaN
--------------------
{'Name': {'a': 'zs', 'b': 'ls', 'c': 'we', 'd': nan}, 'Age': {'a': '10', 'b': '20', 'c': '30', 'd': '40'}, 'country': {'a': '中国', 'b': '韩国', 'c': '日本', 'd': nan}}

相关推荐