时间:2024-09-08 16:36:54 阅读:79
诺维医学科研官网:https://www.newboat.top 更新中! bilibili:文章对应的讲解视频在此。熊大学习社 https://space.bilibili.com/475774512 公众号|B站|全网同名:熊大学习社 医学资源站,https://med.newboat.top/ 内有医学离线数据库、数据提取、科研神器等高质量资料库 诺维AI:https://gpt4.nwzz.xyz 可用GPT4|GPT3.5,改写论文、翻译润色、编写代码的好助手 课程说明: (1)连续变量和分类变量的数据检验-代码实例已全部公开。关注公众号熊大学习社,回复 (2)医学公共数据数据库学习训练营已开班,以及数据提取和数据分析定制,具体扫码咨询课程助理。 (3)关注熊大学习社。您的一键三连是我最大的动力。 (1)连续变量和分类变量的数据检验-代码实例已全部公开。关注公众号熊大学习社,回复 (2)医学公共数据数据库学习训练营已开班,欢迎咨询课程助理! (3)数据提取和数据分析定制,具体扫码咨询课程助理。医学数据分析技能点05 连续变量和分类变量的数据检验
关注公众号熊大学习社,回复 med005,获取资料信息。
代码实例
##### WeChat official account: 熊大学习社 #####
# Manually set the working directory to the folder holding the code and data.
# How: in the menu bar choose "Session" -> "Set Working Directory" -> "Choose Directory",
# then pick the folder that contains the code and data.
# Show the current working directory
getwd()
# 检测是否安装了相关的库,没有则自动安装
if(!require("data.table")) install.packages("data.table")
if(!require("tidyverse")) install.packages("tidyverse")
if(!require('magrittr')) install.packages('magrittr')
if(!require('readxl')) install.packages('readxl')
if(!require('dplyr')) install.packages('dplyr')
if(!require('mice')) install.packages('mice')
if(!require("rstatix")) install.packages("rstatix")
if(!require("nortest")) install.packages("nortest")
if(!require("gtsummary")) install.packages("gtsummary")
# 加载库
library(data.table)
library(tidyverse) # 读取文件fread
library(magrittr)
library(rstatix)
library(readxl) # 数据文件读取与写入
library(dplyr) # 数据处理
library(mice) # 缺失值插补
library(nortest)
library(gtsummary) # baseline table: tbl_summary(), add_p()
# Data preparation -----------
# Folder that holds this script and data_psm.csv; edit it for your machine.
# NOTE(review): the value below has no separators after the drive letter;
# they look to have been lost when the script was published. Restore them
# (e.g. "/" between folder names) before relying on this path.
code_path <- "E:医学AI自媒体 医学数据分析技能点医学数据分析技能点(05)连续变量和分类变量的数据检验"
# Change into the code folder only when it exists. Otherwise keep the working
# directory that was chosen manually (see the instructions at the top of the
# script) and warn, instead of aborting the whole script.
if (dir.exists(code_path)) {
  setwd(code_path)
} else {
  warning(
    "code_path does not exist; keeping the current working directory: ",
    getwd(),
    call. = FALSE
  )
}
getwd()
# Read the data from the working directory
data_psm <- read.csv("data_psm.csv")
# Convert every categorical column to a factor in a single pass, then pin the
# level order of race_imputed (values outside the listed levels become NA).
data_psm <- data_psm %>%
  mutate(
    across(
      all_of(c(
        "race_imputed", "gender",
        "co_diabetes", "co_hypertension", "co_neoplasm", "co_COPD",
        "co_CA_surgery", "co_VTE", "co_CI", "co_GI", "co_ICH",
        "co_bleeding", "group"
      )),
      as.factor
    ),
    race_imputed = factor(
      race_imputed,
      levels = c("White", "Black", "Asian", "Hispanic", "Others")
    )
  )
# Preview the first rows
head(data_psm)
## 4.3 [Self-study] Tests for continuous variables -------
# Mainly normality testing.
# Continuous variables = names of the numeric columns, excluding the
# identifier and the outcome/survival columns.
numeric_var <- data_psm %>%
  select(where(is.numeric)) %>%
  select(!c(hadm_id, status_30, status_90, surv_30, surv_90)) %>%
  colnames()
### 4.3.1 Anderson-Darling (AD) test -----------
# Run the AD normality test on one numeric vector and return the result
# as a tidy data frame (via broom::tidy).
ad.test.multi <- function(x) {
  broom::tidy(ad.test(x))
}
# AD test results, one row per continuous variable
normality <- data_psm[numeric_var] %>%
  map_df(ad.test.multi) %>%
  # add a column with the variable names (rows follow numeric_var order)
  mutate(variable = numeric_var) %>%
  # keep the needed columns and rename the p-value
  select(variable, statistic, p_norm = "p.value")
### 4.3.2 Homogeneity-of-variance test (Levene) ----------------
# For each continuous variable, test equality of variances across `group`.
# The formula is built from the column name so that both sides are resolved
# inside `data_psm` through the `data` argument, rather than mixing a
# free-standing vector with `data_psm$group`.
variance <- numeric_var %>%
  map_df(function(var_name) {
    levene_test(
      data_psm,
      reformulate("group", response = as.name(var_name))
    )
  }) %>%
  # keep only the p-value
  select(p_vari = p) %>%
  # attach the variable names (rows follow numeric_var order)
  cbind(variable = numeric_var)
### 4.3.3 t test ------------
# NOTE(review): paired = TRUE presumably pairs observations by their order
# within each group -- confirm that data_psm is sorted by matched pair.
# var.equal is not expected to matter for a paired test -- confirm.
t_test_res <- data_psm %>%
  # keep the continuous variables plus the grouping column
  select(all_of(numeric_var), group) %>%
  # wide -> long: one row per (observation, variable)
  pivot_longer(
    cols = -group,
    names_to = "variable",
    values_to = "value"
  ) %>%
  group_by(variable) %>%
  # compare the groups separately for every variable
  t_test(formula = value ~ group, paired = TRUE, var.equal = TRUE) %>%
  # keep the variable name and its p-value
  select(variable, p_t_test = p)
### 4.3.4 Wilcoxon test ---------
# NOTE(review): paired = TRUE presumably pairs observations by their order
# within each group -- confirm that data_psm is sorted by matched pair.
wilcox_test_res <- data_psm %>%
  # keep the continuous variables plus the grouping column
  select(all_of(numeric_var), group) %>%
  # wide -> long: one row per (observation, variable)
  pivot_longer(
    cols = -group,
    names_to = "variable",
    values_to = "value"
  ) %>%
  group_by(variable) %>%
  # Wilcoxon test comparing the groups, separately for every variable
  wilcox_test(formula = value ~ group, paired = TRUE) %>%
  # keep the variable name and its p-value
  select(variable, p_wilcox_test = p)
### 4.3.5 Summary of the continuous-variable tests ------------
# One row per continuous variable with the normality, variance, t-test and
# Wilcoxon p-values. p_final is the t-test p-value when both the normality
# and the variance test have p >= 0.05, otherwise the Wilcoxon p-value.
# if_else() is vectorised, so no per-variable grouping is needed and the
# result is left ungrouped for the later steps.
p_numb <- normality %>%
  left_join(variance, by = "variable") %>%
  left_join(t_test_res, by = "variable") %>%
  left_join(wilcox_test_res, by = "variable") %>%
  mutate(
    p_final = if_else(
      p_norm >= 0.05 & p_vari >= 0.05,
      p_t_test,
      p_wilcox_test
    )
  ) %>%
  # add a significance-symbol column based on p_final
  add_significance(p.col = "p_final")
## 4.4 [Self-study] Tests for categorical variables ---------
# Categorical variables = names of the factor columns, without `group`
nominal_var <- data_psm %>%
  select(where(is.factor)) %>%
  select(!c(group)) %>%
  colnames()
### 4.4.1 Chi-square test of independence -----------
p_chisq <- data_psm %>%
  select(all_of(nominal_var), group) %>%
  # wide -> long: one row per (observation, variable)
  pivot_longer(
    cols = all_of(nominal_var),
    names_to = "variable",
    values_to = "value"
  ) %>%
  group_by(variable) %>%
  # chi-square test of group vs. value, run once per variable
  group_modify(~ chisq_test(.x$group, .x$value)) %>%
  # keep the variable name and its p-value
  select(variable, p_chisq = p)
### 4.4.2 Fisher test ------------
# Counts and proportions for every combination of group and the
# categorical variables
freq <- freq_table(data_psm, group, all_of(nominal_var))

# Fisher test of one categorical variable against `group`.
#   x: name of a column of data_psm (character scalar).
# Returns a one-row data frame: the fisher_test() output plus `value`
# (the smallest cell count of the group-by-x table) and `variable` (= x).
# The p-value is simulated (simulate.p.value = TRUE), so it can differ
# slightly between runs.
fisher_map <- function(x) {
  # Column names created by pivot_wider() below. `group` is a factor, so
  # convert to character before using the values for column selection.
  group_cols <- as.character(unique(data_psm$group))
  temp <- freq_table(data_psm, group, all_of(x)) %>%
    # drop the proportion column
    select(-prop) %>%
    # long -> wide: one count column per group
    pivot_wider(
      names_from = "group",
      values_from = "n"
    ) %>%
    # combinations that never occur get a count of 0
    replace(is.na(.), 0)
  p_fisher <- temp %>%
    select(all_of(group_cols)) %>%
    fisher_test(simulate.p.value = TRUE)
  # smallest cell count in the contingency table
  min_n <- temp %>%
    pivot_longer(
      cols = all_of(group_cols)
    ) %>%
    arrange(value) %>%
    slice(1) %>%
    select(value) %>%
    cbind(variable = x)
  cbind(p_fisher, min_n)
}
# all_of() is only meaningful inside a selecting function, so the plain
# character vector is iterated here.
p_fisher <- map_df(nominal_var, fisher_map) %>%
  select(p_fisher = p, min_n = value, variable)
### 4.4.3 Summary of the categorical-variable tests ------------
# One row per categorical variable (despite the name, p_norm holds the
# categorical results). p_final is the chi-square p-value when the sample
# has more than 40 rows and the smallest cell count is at least 5,
# otherwise the Fisher p-value.
# The plain character vector is used directly (all_of() only belongs inside
# a selecting function), and ifelse() is vectorised, so no grouping is needed.
p_norm <- tibble(variable = nominal_var) %>%
  left_join(p_chisq, by = "variable") %>%
  left_join(p_fisher, by = "variable") %>%
  mutate(
    p_final = ifelse(
      nrow(data_psm) > 40 & min_n >= 5,
      p_chisq,
      p_fisher
    )
  )
### 4.4.4 Combined p-values ---------
# Print the continuous-variable p-values
p_numb
# Print the categorical-variable p-values
p_norm
# Stack both tables, keeping only the variable name and its final p-value
p_value <- bind_rows(
  select(p_numb, variable, p_final),
  select(p_norm, variable, p_final)
)
## 4.5 [Self-study] Baseline analysis: tbl_summary -------
# Requires the gtsummary package to be attached.
# English labels, no thousands separator
theme_gtsummary_language("en", big.mark = "")
psm_summary <- data_psm %>%
  # baseline table split by group
  tbl_summary(
    by = group,
    # Declare continuous variables as "continuous2" so that the multi-line
    # statistic below (selected with all_continuous2()) is actually applied.
    type = all_continuous() ~ "continuous2",
    statistic = list(
      # "\u00B1" is the plus-minus sign
      all_continuous2() ~ c(
        "{mean} \u00B1 {sd}",
        "{median} ({p25}, {p75})",
        "{p_miss}"
      ),
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(
      all_continuous() ~ 1,
      all_categorical() ~ 1
    )
  ) %>%
  # add gtsummary's own p-values
  add_p()
# Export the baseline table together with the final p-values computed above.
psm_summary %>%
  as_tibble() %>%
  # the label column becomes the join key
  rename("variable" = `**Characteristic**`) %>%
  # explicit key; rows without a matching variable get NA in p_final
  left_join(p_value, by = "variable") %>%
  # show p-values to three decimals, or "< 0.001" for smaller values
  mutate(p_final = ifelse(p_final >= 0.001, round(p_final, 3), "< 0.001")) %>%
  write_csv(file = "基线表和P值.csv")
# Summary (小结)
关注公众号熊大学习社,回复 med005,获取资料信息。