ValueError:进行两独立样本 t 检验时,操作数无法以给定形状一起广播(operands could not be broadcast together with shapes)

数据挖掘 Python 统计数据 scipy
2022-03-07 15:29:49

我正在尝试执行两样本 t 检验。我的数据集由 744 行和 186 列组成,我已经计算了它们的总和与平均值,现在需要执行两样本 t 检验。我的 csv 如下所示;由于每一行代表一个独立的 ID 及其对应的值,我必须对每一行分别计算 t 检验和秩和检验:

SRA ID  ERR169499            ERR169498           ERR169497
Label   1                    0                   1
TaxID   PRJEB3251_ERR169499  PRJEB3251_ERR169499 PRJEB3251_ERR169499
333046  0.05                 0.99                99.61
1049    0.03                 2.34                34.33
337090  0.01                 9.78                23.22

标签 0 和 1 分别表示 case(病例)和 control(对照)。到目前为止,我的代码如下:

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import ranksums

def transposer(filename):
    """Write a transposed copy of the CSV file *filename*.

    The file is parsed with ``pandas.read_csv`` and its transpose is written,
    without a header row, to ``<stem>_transposed.csv`` in the current working
    directory. ``<stem>`` is the base name of *filename* (text after the last
    ``/``) up to its first dot.
    """
    # "a/b/data.v2.csv" -> "data"
    stem = str(filename).split("/")[-1].split(".")[0]
    # The context manager guarantees the input handle is closed even if
    # parsing fails.
    with open(filename, 'rt') as handle:
        frame = pd.read_csv(handle)
    frame.T.to_csv(stem + '_transposed.csv', header=False)


# Transpose the raw table so that every column of project.csv becomes one
# row of transposed.csv.
pd.read_csv('project.csv').T.to_csv('transposed.csv', header=False)

# Map the second comma-separated field of each metadata line to its last
# field (presumably run ID -> case/control label; verify against
# Meta3251.csv).
contents = {}
with open('Meta3251.csv', 'rt') as meta:
    for ids in meta:
        fields = ids.split(',')
        contents[fields[1]] = fields[-1]

# Prefix every data row with its run ID and metadata value.
with open('transposed.csv', 'rt') as file, open('final_out.csv', 'w') as out:
    for line_number, row in enumerate(file):
        if line_number == 0:
            # First line: prepend the names of the two columns added below.
            # The leading space in ' Label' is significant: later code
            # compares against exactly that text.
            out.write('SraID, Label,' + row)
            continue
        sample = row.split(',')[0]
        try:
            # The run ID is the part after the first underscore of the
            # row's first field.
            pid = sample.split('_')[1]
        except IndexError:
            # No underscore in the first field: report the row and skip it.
            print(sample)
            continue
        if pid not in contents:
            # No metadata for this run: report it and skip the row.
            print(pid)
            continue
        out.write(pid.replace('\n', '') + ','
                  + contents[pid].replace('\n', '') + ',' + str(row))

# final_out.csv has been closed by the ``with`` block, so its contents are
# fully flushed before it is transposed.
transposer('final_out.csv')
# Read final_out_transposed.csv once and collect:
#   label   - values of the first ' Label' row (one entry per sample column)
#   counter - number of label entries
#   data    - first field of every other non-header row -> remaining fields
#   keys    - the keys of ``data`` in file order
label_rows = []
data = {}
with open('final_out_transposed.csv', 'rt') as file1:
    for row in file1:
        fields = row.split(',')
        # Classify the row on its first field before the newline is
        # stripped from the last field.
        row_name = fields[0]
        fields[-1] = fields[-1].replace('\n', '')
        if row_name == ' Label':
            label_rows.append(fields[1:])
        elif row_name not in ('SraID', 'TaxID'):
            data[row_name] = fields[1:]

# Only the first ' Label' row is used; an IndexError here means the file
# contained no such row.
label = label_rows[0]
counter = len(label)
keys = list(data)
# Write one summary row per key of ``data``: sums, counts and means of the
# values whose label is '0' (reported as "case") and of all other values
# (reported as "ctrl").
with open('sum.csv', 'w') as sum_file:
    # NOTE(review): the header ends with a comma, so it has one more (empty)
    # field than the data rows written below. Kept unchanged.
    sum_file.write('TaxId,sum_case,sum_ctrl,case_count,'
                   'ctrl_count,case_mean,ctrl_mean,\n')
    for key in keys:
        values = data[key]
        sum_case = np.float64(0)
        sum_ctrl = np.float64(0)
        count_case = 0
        count_ctrl = 0
        # Index into ``values`` so that a row with fewer fields than labels
        # fails with IndexError and is not silently truncated.
        for i, flag in enumerate(label):
            value = np.float64(values[i])
            if flag == '0':
                sum_case = sum_case + value
                count_case += 1
            else:
                sum_ctrl = sum_ctrl + value
                count_ctrl += 1
        # A group with no samples reports a mean of 0.
        mean_case = sum_case / count_case if count_case else 0
        mean_ctrl = sum_ctrl / count_ctrl if count_ctrl else 0
        columns = (sum_case, sum_ctrl, count_case, count_ctrl,
                   mean_case, mean_ctrl)
        # Every column, counts included, is formatted as a float64.
        sum_file.write(
            key + ',' + ','.join(str(np.float64(v)) for v in columns) + '\n')

# Load the transposed table: file rows 1 and 2 (0-based) form a two-level
# column header and column 0 is the row index. Presumably these header rows
# are the ' Label' and 'TaxID' rows -- confirm against the file.
df  = pd.read_csv('final_out_transposed.csv', header=[1,2], index_col=[0])
# Select the columns whose first header level is '0' / '1', then drop every
# row that contains a missing value.
case = df.xs('0', axis=1, level=0).dropna()
ctrl = df.xs('1', axis=1, level=0).dropna()
# This is the call the question reports as failing with
# "operands could not be broadcast together with shapes (92,) (95,)".
# NOTE(review): ttest_ind works along axis 0 by default, so two 2-D inputs
# need matching column counts; presumably 92 and 95 are the numbers of case
# and control columns -- confirm.
(tt_val, p_ttest) = ttest_ind(case, ctrl, equal_var=False)
print (tt_val)
print (p_ttest)

我收到错误消息:

ValueError: operands could not be broadcast together with shapes (92,) (95,)

我该如何处理这个错误?我无法更改我的数据。

1个回答

这个问题的答案是:用 xs 方法创建的 Pandas DataFrame 对象是二维数组(two-dimensional arrays),在传递给 ttest_ind 时必须先展平(flatten)为一维数组(one-dimensional arrays)。Pandas 对象的 values 属性会给出一个 numpy 数组(numpy array),而 ravel() 方法会将该数组展平为一维。代码如下:

# Load the table with a two-level column header (file rows 1 and 2) and the
# first column as the index, then split it by the first header level.
df = pd.read_csv(
    'final_out_transposed.csv',
    header=[1, 2],
    index_col=[0],
)
case = df.xs('0', axis=1, level=0).dropna()
ctrl = df.xs('1', axis=1, level=0).dropna()

# Flatten each 2-D frame into a 1-D numpy array so that the two inputs no
# longer need matching shapes.
# NOTE(review): this pools every value of every row into a single test. If
# one test per row is wanted, ttest_ind's ``axis`` argument is presumably
# the tool -- confirm against the scipy documentation.
case_flat = case.values.ravel()
ctrl_flat = ctrl.values.ravel()

tt_val, p_ttest = ttest_ind(case_flat, ctrl_flat, equal_var=False)
print(tt_val)
print(p_ttest)