我正在尝试执行两个样本 t 检验。我的数据集由 744 行和 186 列组成,我计算了它们的总和和平均值。我需要执行两个样本 t 检验。我的 csv 看起来像这样,我必须从中计算每行的 ttest 和秩和测试,因为单独的行表示单独的 ID 并具有相应的值:
SRA ID ERR169499 ERR169498 ERR169497
Label 1 0 1
TaxID PRJEB3251_ERR169499 PRJEB3251_ERR169499 PRJEB3251_ERR169499
333046 0.05 0.99 99.61
1049 0.03 2.34 34.33
337090 0.01 9.78 23.22
标签 0 和 1 分别用于 case 和 control。到目前为止,我已经这样做了:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import ranksums
def transposer(filename):
file = open(filename, 'rt')
pd.read_csv(file).T.to_csv(str(filename).split("/")
[-1].split(".")[0]+'_transposed.csv',header=False)
pd.read_csv('project.csv').T.to_csv('transposed.csv', header=False)
file = open('transposed.csv', 'rt')
out = open('final_out.csv', 'w')
meta = open('Meta3251.csv', 'rt')
contents = {}
for ids in meta:
contents[ids.split(',')[1]]=ids.split(',')[-1]
count = 0
for row in file:
if count == 0:
out.write('SraID, Label,'+row)
count=1
else:
try:
pid = row.split(',')[0].split('_')[1]
out.write(pid.replace('\n','')+','+contents[pid].replace('\n','')
+','+str(row))
out.flush()
except:
print(pid)
pass
file.close()
out.close()
transposer('final_out.csv')
file1 = open('final_out_transposed.csv','rt')
label = []
data = {}
x = open('final_out_transposed.csv','rt')
for r in x:
datas = r.split(',')
if datas[0] == ' Label':
label.append(r.split(",")[1:])
label = label[0]
label[-1] = label[-1].replace('\n','')
counter = len(label)
for row in file1:
content = row.split(',')
if content[0]=='SraID' or content[0]== 'TaxID' or content[0]==' Label':
pass
else:
dt = row.split(',')
dt[-1] = dt[-1].replace('\n','')
data[dt[0]]=dt[1:]
keys = list(data)
sum_file = open('sum.csv','w')
sum_file.write('TaxId,sum_case,sum_ctrl,case_count,
ctrl_count,case_mean,ctrl_mean,\n')
for key in keys:
sum_case = 0
sum_ctrl = 0
count_case = 0
count_ctrl = 0
mean_case = 0
mean_ctrl = 0
for i in range(counter):
if label[i] == '0':
sum_case=np.float64(sum_case)+np.float64(data[key][i])
count_case = count_case+1
mean_case = sum_case/count_case
else:
sum_ctrl = np.float64(sum_ctrl)+np.float64(data[key][i])
count_ctrl = count_ctrl+1
mean_ctrl = sum_ctrl/count_ctrl
sum_file.write(key+','+str(np.float64((sum_case)))+','
+str(np.float64((sum_ctrl)))+','+str(np.float64((count_case)))
+','+str(np.float64((count_ctrl)))+','+str(np.float64((mean_case)))
+','+str(np.float64((mean_ctrl)))+'\n')
sum_file.flush()
sum_file.close()
df = pd.read_csv('final_out_transposed.csv', header=[1,2], index_col=[0])
case = df.xs('0', axis=1, level=0).dropna()
ctrl = df.xs('1', axis=1, level=0).dropna()
(tt_val, p_ttest) = ttest_ind(case, ctrl, equal_var=False)
print (tt_val)
print (p_ttest)
我收到错误消息:
ValueError: 操作数不能与形状 (92,) (95,) 一起广播
我该如何处理这个错误。我无法更改我的数据。