数据整理
- 在数据清洗过程中,很多时候需要将不同的数据整理在一起,方便后续的分析,这个过程也叫数据合并
- 常见的合并方法有堆叠和按主键进行合并,堆叠又分为横向堆叠和纵向堆叠,按主键合并类似于sql里面的关联操作
- 横向堆叠将两张表或多张表在X轴方向,即横向拼接在一起
- 纵向堆叠将两张表或多张表在Y轴方向,即纵向拼接在一起
- 注意 concat 的 join 参数(以及 merge 的 how 参数)可取 'inner' 或 'outer',分别代表按键取交集和并集
# xlrd is used to read the .xlsx workbook's sheet metadata below
import xlrd
import os
import pandas as pd
import numpy as np  # fix: the alias was missing ("import numpy as"), a SyntaxError

# Echo the current working directory (notebook-style)
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'
# Switch the working directory to the data folder so relative file names resolve.
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
# Open the workbook with xlrd so we can inspect its structure.
workbook = xlrd.open_workbook('meal_order_detail.xlsx')
# Get the names of all worksheets in the workbook
sheet_name = workbook.sheet_names()
# Echo the sheet names (notebook-style output follows)
sheet_name
['meal_order_detail1', 'meal_order_detail2', 'meal_order_detail3']
# Read each of the three detail sheets into its own DataFrame.
order1 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail1')
order2 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail2')
order3 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail3')
# Vertical (row-wise) stack; ignore_index=True rebuilds a fresh 0..n-1 index
order = pd.concat([order1, order2, order3], axis=0, ignore_index=True)
# Get the last 5 rows to confirm the concatenation reached the final sheet
order.tail(5)
detail_id | order_id | dishes_id | logicprn_name | parent_class_name | dishes_name | itemis_add | counts | amounts | cost | place_order_time | discount_amt | discount_reason | kick_back | add_inprice | add_info | bar_code | picture_file | emp_id |
10032 | 5683 | 672 | 610049 | NaN | NaN | 爆炒双丝 | 1 | 35 | NaN | 2016-08-31 21:53:30 | NaN | NaN | NaN | NaN | NaN | caipu/301003.jpg | 1089 |
10033 | 5686 | 672 | 609959 | NaN | NaN | 小炒羊腰\r\n\r\n\r\n | 1 | 36 | NaN | 2016-08-31 21:54:40 | NaN | NaN | NaN | NaN | NaN | caipu/202005.jpg | 1089 |
10034 | 5379 | 647 | 610012 | NaN | NaN | 香菇鹌鹑蛋 | 1 | 39 | NaN | 2016-08-31 21:54:44 | NaN | NaN | NaN | NaN | NaN | caipu/302001.jpg | 1094 |
10035 | 5380 | 647 | 610054 | NaN | NaN | 不加一滴油的酸奶蛋糕 | 1 | 7 | NaN | 2016-08-31 21:55:24 | NaN | NaN | NaN | NaN | NaN | caipu/501003.jpg | 1094 |
10036 | 5688 | 672 | 609953 | NaN | NaN | 凉拌菠菜 | 1 | 27 | NaN | 2016-08-31 21:56:54 | NaN | NaN | NaN | NaN | NaN | caipu/303004.jpg | 1089 |
# Merge all sheets generically by looping over the discovered sheet names.
# Collect the per-sheet frames first and concatenate ONCE at the end:
# calling pd.concat inside the loop re-copies the already-accumulated rows
# on every iteration, which is quadratic in the total number of rows.
frames = [
    pd.read_excel('meal_order_detail.xlsx', sheet_name=i)
    for i in sheet_name
]
# Guard the empty case: pd.concat([]) raises, while the original loop
# produced an empty DataFrame.
basic = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
# (rows, columns) of the combined table
basic.shape
(10037, 19)
# Tail of the loop-built table — should match the explicit three-frame concat
basic.tail(5)
detail_id | order_id | dishes_id | logicprn_name | parent_class_name | dishes_name | itemis_add | counts | amounts | cost | place_order_time | discount_amt | discount_reason | kick_back | add_inprice | add_info | bar_code | picture_file | emp_id |
10032 | 5683 | 672 | 610049 | NaN | NaN | 爆炒双丝 | 1 | 35 | NaN | 2016-08-31 21:53:30 | NaN | NaN | NaN | NaN | NaN | caipu/301003.jpg | 1089 |
10033 | 5686 | 672 | 609959 | NaN | NaN | 小炒羊腰\r\n\r\n\r\n | 1 | 36 | NaN | 2016-08-31 21:54:40 | NaN | NaN | NaN | NaN | NaN | caipu/202005.jpg | 1089 |
10034 | 5379 | 647 | 610012 | NaN | NaN | 香菇鹌鹑蛋 | 1 | 39 | NaN | 2016-08-31 21:54:44 | NaN | NaN | NaN | NaN | NaN | caipu/302001.jpg | 1094 |
10035 | 5380 | 647 | 610054 | NaN | NaN | 不加一滴油的酸奶蛋糕 | 1 | 7 | NaN | 2016-08-31 21:55:24 | NaN | NaN | NaN | NaN | NaN | caipu/501003.jpg | 1094 |
10036 | 5688 | 672 | 609953 | NaN | NaN | 凉拌菠菜 | 1 | 27 | NaN | 2016-08-31 21:56:54 | NaN | NaN | NaN | NaN | NaN | caipu/303004.jpg | 1089 |
# Read the trade history; keep user_id as a string so large IDs are not
# coerced to floats/ints and the join key stays comparable across files.
df = pd.read_csv('baby_trade_history.csv', dtype={'user_id': str})
df1 = pd.read_csv('sam_tianchi_mum_baby.csv', dtype={'user_id': str})
# Baby profile data (user_id, birthday, gender per the output below)
df1.head(5)
user_id | birthday | gender |
2757 | 20130311 | 1 |
1 | 415971 | 20121111 |
2 | 1372572 | 20120130 | 1 |
3 | 10339332 | 20110910 |
4 | 10642245 | 20130213 |
# Trade / transaction data
df.head(5)
user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 |
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 |
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 |
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 |
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 |
# Join the profile columns from df1 onto df using the shared "user_id" key;
# an inner join keeps only users present in BOTH tables. Since the key
# column has the same name on both sides, on= is equivalent to spelling
# out left_on=/right_on= and yields a single user_id column.
df2 = pd.merge(df, df1, how='inner', on='user_id')
# Inspect the first rows of the merged result
df2.head(5)
user_id | auction_id | cat_id | cat1 | property | buy_mount | day | birthday | gender |
513441334 | 19909384116 | 50010557 | 50008168 | 25935:21991;1628665:29784;22019:34731;22019:20... | 1 | 20121212 | 20110105 | 1 |
1 | 377550424 | 15771663914 | 50015841 | 28 | 1628665:3233941;1628665:3233942;3914866:11580;... | 1 | 20121123 | 20110620 | 1 |
2 | 47342027 | 14066344263 | 50013636 | 50008168 | 21458:21599;13585028:3416646;1628665:3233942;1... | 1 | 20120911 | 20101008 | 1 |
3 | 119784861 | 20796936076 | 50140021 | 50008168 | 21458:120325094;22019:2026;22019:34731;22019:3... | 1 | 20121129 | 20120327 |
4 | 159129426 | 15198386301 | 50013711 | 50008168 | 21458:11580;1628665:29778;22019:3340598;22019:... | 2 | 20120808 |