python数据分析-数据处理
数据导入:




导入csv from pandas import read_csv;df = read_csv('D://PA//4.1//1.csv') 导入文本,要转成UTF-8无BOM格式: from pandas import read_table;df = read_table('D://PA//4.1//2.txt') 导入excle from pandas import read_excel;df = read_excel('C:/PA/4.1/3.xlsx')View Code
数据导出:
数据的导出:

from pandas import DataFrame;df = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI'] });df.to_csv("c:/PA/4.1/df.csv");#不导入序号 df.to_csv("c:/PA/4.1/df.csv", index=False);View Code
重复值处理:


from pandas import read_csv;df = read_csv('C:/PA/4.1/data.csv')newDF = df.drop_duplicates();View Code
缺失值处理:


from pandas import read_csv;df = read_csv('C:/PA/4.4/data.csv')newDF = df.dropna();View Code
空格值处理:

from pandas import read_csv;df = read_csv('C:/PA/4.5/data.csv')newDF = df["name"].str.strip(); df["name"]=newDF;View Code
字段抽取:

astype(str) 转换成字符型数据,以便于处理。
from pandas import read_csv;df = read_csv('C:/PA/4.6/data.csv')df["tel"]=df["tel"].astype(str);bands=df["tel"].str.slice(0,3);areas=df["tel"].str.slice(3,7);numbs=df["tel"].str.slice(7,11);View Code
字段拆分:

from pandas import read_csv; df=read_csv("C:/PA/4.7/data.csv");newDF=df["name"].str.split(" ",1,True); newDF.columns=["band","name"];View Code
记录抽取:



import pandas; from pandas import read_csv; df=read_csv("C:/PA/4.8/data.csv",sep="|"); df[df.comments>1000]; df[df.comments.between(1000,10000)]; df[pandas.isnull(df.title)]; df[df.title.str.contains("台电",na=False)]; df[(df.comments>=1000)&(df.comments<=10000)]View Code
随机抽样:

import numpy; from pandas import read_csv; df=read_csv("C:/PA/4.9/data.csv"); r=numpy.random.randint(0,10,3); df.loc[r,:];View Code
记录合并 :


import pandas; from pandas import read_csv;df1=read_csv("C:/PA/4.10/data1.csv",sep="|"); df2=read_csv("C:/PA/4.10/data2.csv",sep="|"); df3=read_csv("C:/PA/4.10/data3.csv",sep="|");df=pandas.concat([df1,df2,df3])View Code
字段合并:


from pandas import read_csv;df = read_csv("C:/PA/4.11/data.csv",sep=" ", names=['band', 'area', 'num'] );df = df.astype(str);tel = df['band'] + df['area'] + df['num']View Code
字段匹配:


import pandas; from pandas import read_csv; item=read_csv("C:/PA/4.12/data1.csv",sep="|",names=["id","comments","title"] ); prices=read_csv("C:/PA/4.12/data1.csv",sep="|",names=["id","oldprice","newprice"]) itemprices=pandas.merge(item,prices,left_on="id",right_on="id");View Code
简单计算:

import pandas; from pandas import read_csv;df=read_csv("C:/PA/4.13/data.csv",sep="|"); result=df.price*df.num df["sum"]=resultView Code
数据标准化:

import pandas; from pandas import read_csv;df=read_csv("C:/PA/4.14/data.csv");scale=(df.score-df.score.min())/(df.score.max()-df.score.min())View Code
数据分组:


import pandas; from pandas import read_csv;df = read_csv("C:\\PA\\4.15\\data.csv", sep='|');bins = [min(df.cost)-1, 20, 40, 60, 80, 100, max(df.cost)+1];labels = ['20以下', '20到40', '40到60', '60到80', '80到100', '100以上'];pandas.cut(df.cost, bins)pandas.cut(df.cost, bins, right=False)pandas.cut(df.cost, bins, right=False, labels=labels)View Code
日期转换:

import pandas; from pandas import read_csv; from pandas import to_datetime;df = read_csv("C:\\PA\\4.16\\data.csv",encoding="utf-8"); df_dt=to_datetime(df.注册时间,format="%Y/%m/%d");View Code
日期格式化:

import pandas; from pandas import read_csv; from pandas import to_datetime;df = read_csv("C:\\PA\\4.16\\data.csv",encoding="utf-8"); df_dt=to_datetime(df.注册时间,format="%Y/%m/%d"); df_dt_str=df_dt.apply(lambda x:datatime.strftime(x,"%d-%m-%Y"))View Code
日期抽取:

import pandas; from pandas import read_csv; from pandas import to_datetime;df = read_csv("C:\\PA\\4.18\\data.csv",encoding="utf-8"); df_dt=to_datetime(df.注册时间,format="%Y/%m/%d"); df_dt.dt.year; df_dt.dt.second; df_dt.dt.minute; df_dt.dt.hour; df_dt.dt.day; df_dt.dt.month; df_dt.dt.weekday;View Code
转载于:https://www.cnblogs.com/qiuyuyu/p/9144034.html
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
