多因素分析及Python工具
多因素分析时由于自变量较多导致分析过程复杂,分析过程复杂且容易逻辑混乱,现将主要思路总结如下
主要思路
一、独立性分析
1、分析各自变量直接是否独立,如果独立则分析较为简单,如果不独立则较为困难。
2、分析各自变量和因变量之间是否有关系,无关系则无需分析,减少要分析的自变量。
3、主要方法为各类检验,如卡方检验、F检验、T检验等。以及相关性检验。
二、转换为单变量分析
将其他变量固定,每次单分析一个变量。
三、分析主要变量
对变量中的主要类别进行分析,次要类别先不分析,一般是分析样本量里占比高的类别。
四、聚类分析
将变量聚类,然后按聚好的类,分类分析。聚类可以为
无、其他算法
1、关联分析。
2、机器学习算法。
相关Python 工具
一、数据清洗
1、选择关键列
2、按一定的规则对某些列归类,实际就是多重if 和elif
import pandas as pd
from pandasrw import load,dump# 根据规则填充数据
def fill_data_category(row):if row['无线协议'] == 'Wi-Fi 5' and row['带宽'] == 80:return 'C1'elif row['无线协议'] == 'Wi-Fi 6' and row['频段'] == '2.4G' and row['带宽'] == 20:return 'C2'elif row['无线协议'] == 'Wi-Fi 6' and row['频段'] == '5G' and row['带宽'] == 80:return 'C3'else:return '其他'# 根据规则填充天线数数据
def fill_data_antenna(row):if row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 2:return 2elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 4:return 2elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 5:return 2elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 6:return 3elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 2:return 2elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 4:return 2elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 5:return 3elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 6:return 3else:# 如果条件都不满足,则返回空格 ""。return ""if __name__ == '__main__':path_s = r"D:\data\2023\wifi 测试\WIFI测试跟踪表--20230611-核准-V2.xlsx"path_t = r"D:\data\2023\wifi 测试\归类.csv"df_s = load(path_s,sheetname="测试问题跟踪表")df = df_skey_col=['墙体类别', '测试终端类型及型号', "天线数",'测试软件名称', '路由器索引', '无线协议', '频段', '电平', '带宽','下行', '上行', '测试场景']df = df[key_col]df['类型'] = df.apply(fill_data_category, axis=1)df = df[df['类型'] != "其他"]df['等效天线数'] = df.apply(fill_data_antenna, axis=1)dump(df,path_t)
二、单变量分析
import pandas as pd
from pandasrw import load,dump
from functools import reduce
import time
"""
df:要分析的数据
essential_list:要因所在的列,即因变量,是多个列名的列表
analysis_col:本次要分析的列,即每次分析的列,是列名
target_col:目标列,要汇总的列,是列名"""
def df_filter(df, essential_list, analysis_col, target_col):clst = [i for i in essential_list if i != analysis_col]# 将列表中包含的列的内容作为字符串合并成新列group_coldf["group_col"] = df[clst].astype(str).apply(lambda x: '/'.join(x), axis=1)name_list = df[analysis_col].unique().tolist()result_list = []for name in name_list:data = df[df[analysis_col] == name]# 对新列groupby计算目标列的均值、最大值、最小值和方差result = data.groupby("group_col")[target_col].agg(['mean', 'max', 'min', 'var'])result = result.rename(columns={'mean': f"mean:{name}", 'max': f"max:{name}", 'min': f"min:{name}", 'var': f"var:{name}"})result_list.append(result)
#分列和改变列的顺序pf = reduce(df_merge, result_list)pf.reset_index(inplace=True)df_m = df['group_col'].str.split('/', expand=True)df_m.columns = essential_listdf_t = pd.concat([df_m, df], axis=1)#将"group_col"放到第一列df_t.insert(0, "group_col", df_t.pop("group_col"))return pf,df_t#只返回平均值
def df_filter_mean(df, essential_list, analysis_col, target_col):clst = [i for i in essential_list if i != analysis_col]# 将列表中包含的列的内容作为字符串合并成新列group_coldf["group_col"] = df[clst].astype(str).apply(lambda x: '/'.join(x), axis=1)name_list = df[analysis_col].unique().tolist()result_list = []for name in name_list:data = df[df[analysis_col] == name]# 对新列groupby计算目标列的均值、最大值、最小值和方差result = data.groupby("group_col")[target_col].agg(['mean'])result = result.rename(columns={'mean': f"mean:{name}"})result_list.append(result)# 分列和改变列的顺序pf = reduce(df_merge, result_list)pf.reset_index(inplace=True)df_m = df['group_col'].str.split('/', expand=True)df_m.columns = essential_listdf_t = pd.concat([df_m, df], axis=1)# 将"group_col"放到第一列df_t.insert(0, "group_col", df_t.pop("group_col"))return pf, df_t#合并各类分析的结果,how用outer,取并集
def df_merge(df1, df2):df = pd.merge(df1, df2, on="group_col",how="outer")return dfif __name__ == '__main__':path_s = r"D:\data\2023\wifi 测试\WIFI测试分析表.csv"path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表123.xlsx"#path_t =f"path_t{time.time()}"print(path_t)df = load(path_s, engine="pandas")essential_list=['路由器型号','测试场景']analysis_col = "测试软件名称"target_col = "下行"pf=df_filter(df, essential_list, analysis_col, target_col)[0]dump(pf,path_t,engine="pandas")
三、相关性检验
1、卡方检验
import pandas as pd
import numpy as np
from pandasrw import load,dump
from functools import reduce
import time
from scipy.stats import chi2_contingency#x,y为单列的pandas Series 或 1d numpy array
def chi2_r(x,y):# 将分类变量和连续变量进行交叉制表obs_table = pd.crosstab(x, y)# 进行卡方检验并获取卡方值和p值chi2, p, dof, ex = chi2_contingency(obs_table)return chi2, p
#x,y为单列的pandas Series 或 1d numpy array
def cramers_v(x, y):obs_table = pd.crosstab(x, y)chi2, p, dof, ex = chi2_contingency(obs_table)n = obs_table.sum().sum()phi2 = chi2 / nr, k = obs_table.shapephi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))rcorr = r - ((r-1)**2)/(n-1)kcorr = k - ((k-1)**2)/(n-1)V = np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))return V#X是一个列名列表,每个元素都是列名,与Y列逐个进行卡方计算,Y是列名
def chi2_df(df,X_list,Y):i=0df_c=pd.DataFrame(columns=["列1","列2","chi2","p","v"])y=df[Y]for X in X_list:x=df[X]s=chi2_r(x,y)df_c.at[i,"列1"]=Xdf_c.at[i,"列2"]=Ydf_c.at[i,"chi2"]=s[0]df_c.at[i,"p"]=s[1]df_c.at[i,"v"]=cramers_v(x, y)i+=1return df_cif __name__ == '__main__':path_s = r"D:\data\2023\wifi 测试\WIFI测试分析表.csv"path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表相关性223.xlsx"df_s = load(path_s, engine="pandas")df=df_sdf_c = pd.DataFrame(columns=["列1", "列2", "chi2", "p", "v"])X_list = ['测试软件名称', '路由器型号', '无线协议', '频段', '电平', '带宽', '信道', '下行', '测试场景']Y="下行"df_t=chi2_df(df,X_list,Y)dump(df_t,path_t)
2、F检验 方差分析anova
import pandas as pd
import numpy as np
from pandasrw import load,dump
from functools import reduce
import time
from scipy.stats import chi2_contingency
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import olsdef F_anova(df,category,continuous):formula = f"{continuous} ~{category} "# 运行一元线性回归模型并执行方差分析model = ols(formula=formula, data=df).fit()anova_table = anova_lm(model, typ=2)return anova_tabledef F_df(df,X_list,Y):i=0df_c=pd.DataFrame(columns=["列1","列2","F","p"])for X in X_list:print(X)pf=df[[X,Y]]pf.dropna(inplace=True, how='any')s=F_anova(pf,X,Y)df_c.at[i,"列1"]=Xdf_c.at[i,"列2"]=Ydf_c.at[i,"F"]=s.iloc[0,2]df_c.at[i,"p"]=s.iloc[0,3]i+=1return df_cif __name__ == '__main__':path_s =r"D:\data\2023\wifi 测试\WIFI测试跟踪表V2.csv"path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表相关性323.xlsx"df_s = load(path_s, engine="pandas")df=df_sC=['测试软件名称', '无线协议', '频段', '电平', '带宽', '信道', '测试场景']F = F_df(df, C, "下行")print(F)
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
