Python描述性统计示例_数据科学汇集-CSDN博客-免疫在线蚂蚁淘旗下平台-

当前位置：首页 > 新闻动态 >

热卖商品

dfdist/Cobalt/7440-48-4

dfdist/Calcium hydride (CaH)/14452-75-6

新闻详情

Python描述性统计示例_数据科学汇集-CSDN博客

来自：CSDN技术社区　发布时间：2021-03-25

Python描述性统计示例 1 声明

本文的数据来自网络，部分代码也有所参照，这里做了注释和延伸，旨在技术交流，如有冒犯之处，请联系博主及时处理。

2 描述性统计分析简介

描述性统计分析是指运用制表和分类、图形以及计算概括性数据来描述数据特征的一种分析活动。因为本文采用的是Python语言，所以这里采用dataframe、pyplot里的方法来实现数据的描述性统计分析。

3 描述下二手房数据

import warnings

# Silence warnings before seaborn is imported: the original script notes that
# the seaborn version in use emits a bunch of warnings that can be ignored.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style="white", color_codes=True)


def _print_central_tendency(price):
    """Print mean/median/std of *price*, then the same with the mode (two ways)."""
    summary = price.agg(["mean", "median", "std"])
    print(summary)

    # Method 1: Series.mode() returns the modes sorted ascending, so the first
    # entry is the smallest mode (what scipy.stats.mode reported originally).
    mode_value = price.mode().iloc[0]
    print(pd.concat([pd.Series([mode_value], index=["mode"]), summary]))
    print("###############")

    # Method 2: the most frequent value is the first index entry of
    # value_counts() sorted by descending frequency.
    mode_n = price.value_counts(ascending=False).index[0]
    print(pd.concat([pd.Series([mode_n], index=["mode"]), summary]))
    print("###############")


def _print_value_counts(hsdata, skip=("area", "price")):
    """Print a one-row frequency table for every column not listed in *skip*."""
    for column in hsdata.columns:
        if column in skip:
            continue
        counts = hsdata[column].value_counts()
        print("变量" + column + "频次统计:")
        # The original printed the same table twice under two row labels
        # ('value_cnt' and 'value_counts'); both prints are kept.
        print(counts.to_frame("value_cnt").T)
        print(counts.to_frame("value_counts").T)
        print("\n")


def _boxplot_by_median(hsdata, column, xlabel, title):
    """Show a price boxplot grouped by *column*, groups ordered by median price."""
    subset = hsdata[[column, "price"]]
    # Select 'price' before aggregating so non-numeric columns are never fed
    # to median().
    order = subset.groupby(column)["price"].median().sort_values(ascending=True).index.values
    sns.boxplot(x=column, y="price", data=subset, order=order, hue=None)
    plt.ylabel("单位面积房价(万元/平方米)")
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()


def sndhsVisual(csv_path: str = "input/sndHsPr.csv") -> None:
    """Print descriptive statistics and show plots for the second-hand housing data.

    Per the original author's note, the data set has fields for district, number
    of rooms, number of halls, area, floor, subway, school district and price.
    The columns actually read here are ``dist``, ``roomnum``, ``halls``, ``area``,
    ``floor``, ``subway``, ``school`` and ``price``.

    Args:
        csv_path: Path of the CSV file to load. Defaults to the path that was
            hard-coded in the original script.

    Raises:
        KeyError: If ``dist`` contains a value missing from the district mapping.
    """
    hsdata = pd.read_csv(csv_path)

    # Map the romanised district names to Chinese for display in the charts.
    dict_dist = {
        "chaoyang": "朝阳",
        "dongcheng": "东城",
        "fengtai": "丰台",
        "haidian": "海淀",
        "shijingshan": "石景山",
        "xicheng": "西城",
    }
    hsdata["dist"] = hsdata["dist"].apply(lambda name: dict_dist[name])

    _print_central_tendency(hsdata["price"])

    # Skewness > 0 means right-skewed data; kurtosis < 0 means a flatter,
    # more spread-out distribution (original author's notes).
    print("偏度" + str(hsdata["price"].skew()))
    print("峰度" + str(hsdata["price"].kurtosis()))

    # Express the price in units of 10,000 and show floats with two decimals.
    hsdata["price"] = hsdata["price"] / 10000
    pd.set_option("display.float_format", "{:.2f}".format)

    # Continuous variables: describe() is only meaningful for these two.
    print(hsdata[["area", "price"]].describe(include=[np.number]))

    # Discrete variables: frequency of every value.
    _print_value_counts(hsdata)

    # Distribution of the unit price. SimHei is selected as the sans-serif
    # font, presumably so the Chinese labels render.
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is
    # kept here so the figure stays the same as the original's.
    # kde=True draws a Gaussian kernel density estimate; rug=False omits the
    # per-observation ticks on the x axis.
    sns.distplot(hsdata["price"], color="green", bins=20, kde=True, rug=False)
    plt.xlabel("房子单价 单位万/平米")
    plt.ylabel("密度")
    # The original draws the plain histogram on the same axes before showing.
    plt.hist(hsdata["price"], bins=20)
    plt.show()

    # District ranking by median price, ascending.
    dist_median = hsdata.groupby("dist")["price"].median().sort_values(ascending=True)
    print("按照地区中位数升序的排名:" + str(dist_median.index.values))

    # Approach 1: fix the display order through an ordered list of categories.
    # Work on a copy so the assignment does not target a slice of hsdata.
    data_tmp = hsdata[["dist", "price"]].copy()
    data_tmp["dist"] = (
        data_tmp["dist"]
        .astype("category")
        .cat.set_categories(["石景山", "丰台", "朝阳", "海淀", "东城", "西城"])
    )
    sns.boxplot(x="dist", y="price", data=data_tmp)
    plt.ylabel("房价单价(万元/平方米)")
    plt.xlabel("城区")
    plt.title("城区对房价的分组箱线图")
    plt.show()

    # Approach 2: pass the median-sorted order to sns.boxplot via `order`.
    _boxplot_by_median(hsdata, "dist", "城区", "城区对房价的分组箱线图")

    # Number of rooms vs. price.
    _boxplot_by_median(hsdata, "roomnum", "房子室数", "房子室数对房价的分组箱线图")

    # Number of halls vs. price.
    _boxplot_by_median(hsdata, "halls", "房子厅数", "房子厅数对房价的分组箱线图")
    data_halls = hsdata[["halls", "price"]].copy()
    print(data_halls.groupby("halls")["price"].median().sort_index(ascending=False))
    # Values outside the listed categories become NaN and drop out of the plot.
    data_halls["halls"] = data_halls["halls"].astype("category").cat.set_categories([0, 1, 2, 3])
    data_halls.boxplot(column="price", by="halls")
    plt.show()

    # Area against price.
    hsdata[["area", "price"]].plot.scatter(x="price", y="area")
    plt.show()

    _boxplot_by_median(hsdata, "floor", "楼层", "楼层对房价的分组箱线图")
    _boxplot_by_median(hsdata, "subway", "地铁", "地铁对房价的分组箱线图")
    _boxplot_by_median(hsdata, "school", "学校", "学校对房价的分组箱线图")

    # Median price for every district / school-district combination.
    print(
        hsdata.groupby(["dist", "school"])["price"]
        .median()
        .sort_index(ascending=False)
        .unstack()
    )
    hsdata.boxplot(column="price", by=["dist", "school"], figsize=(12, 6))
    plt.show()


if __name__ == "__main__":
    sndhsVisual()

4 执行结果

D:\Program Files\Python37\python.exe E:/dataVisual/Iris.py
mean 61151.810919
median 57473.000000
std 22293.358147
Name: price, dtype: float64
mode 50000.000000
mean 61151.810919
median 57473.000000
std 22293.358147
dtype: float64
###############
mode 50000.000000
mean 61151.810919
median 57473.000000
std 22293.358147
dtype: float64
###############
偏度0.6794935869486859
峰度-0.019305888544372873
area price
count 16210.00 16210.00
mean 91.75 6.12
std 44.00 2.23
min 30.06 1.83
25% 60.00 4.28
50% 78.83 5.75
75% 110.52 7.61
max 299.00 14.99
变量dist频次统计:
丰台 海淀 朝阳 东城 西城 石景山
value_counts 2947 2919 2864 2783 2750 1947