counter <- 0
vector <- as.integer(runif(100, max=5, min=-5))
count_zeros <- function(x) {
for (i in 1:length(x)) {
if (x[i] == 0) {
counter <- counter + 1}
}
return(counter)
}
count_zeros(x=vector)
## [1] 22
counter <- length(which(vector==0))
print(counter)
## [1] 22
create_matrix <- function(x,y) {
mat <- matrix(sample(1:10, size = x*y),
nrow=x,
ncol=y)
for(i in 1:x) {
for(j in 1:y) {
(mat[i,j] <- i*j)
}
}
return(mat)
}
create_matrix(2,2)
## [,1] [,2]
## [1,] 1 2
## [2,] 2 4
group1 <- data.frame(rnorm(50, mean=7, sd=1))
colnames(group1) <- "group1"
head(group1)
## group1
## 1 7.174038
## 2 6.254457
## 3 7.716757
## 4 7.273712
## 5 7.802367
## 6 6.135106
group2 <- data.frame(rnorm(50, mean=17, sd=1))
colnames(group2) <- "group2"
head(group2)
## group2
## 1 16.81634
## 2 15.60387
## 3 17.08514
## 4 15.20910
## 5 18.00867
## 6 15.65002
group3 <- data.frame(rnorm(50, mean=27, sd=1))
colnames(group3) <- "group3"
head(group3)
## group3
## 1 26.69755
## 2 26.46541
## 3 26.62676
## 4 28.10484
## 5 26.88139
## 6 27.51957
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
BindGroups <- cbind(group1, group2, group3)
head(BindGroups)
## group1 group2 group3
## 1 7.174038 16.81634 26.69755
## 2 6.254457 15.60387 26.46541
## 3 7.716757 17.08514 26.62676
## 4 7.273712 15.20910 28.10484
## 5 7.802367 18.00867 26.88139
## 6 6.135106 15.65002 27.51957
BindGroups <- pivot_longer(BindGroups, cols=group1:group3, names_to = "group")
head(BindGroups)
## # A tibble: 6 × 2
## group value
## <chr> <dbl>
## 1 group1 7.17
## 2 group2 16.8
## 3 group3 26.7
## 4 group1 6.25
## 5 group2 15.6
## 6 group3 26.5
shuffle_and_mean <- function(x) {
Shuffle <- transform(x, shuffled=sample(value))
group1 <- mean(Shuffle$shuffled[Shuffle$group=="group1"])
group2 <- mean(Shuffle$shuffled[Shuffle$group=="group2"])
group3 <- mean(Shuffle$shuffled[Shuffle$group=="group3"])
means <- cbind(group1,group2,group3)
return(means)
}
shuffle_and_mean(BindGroups)
## group1 group2 group3
## [1,] 17.83731 15.25844 18.00835
df <- data.frame()
replicate <- c(1:100)
for(i in 1:100) {
means <- shuffle_and_mean(BindGroups)
df <- rbind(df,means)
}
df <- cbind(replicate, df)
head(df)
## replicate group1 group2 group3
## 1 1 17.21124 15.92943 17.96343
## 2 2 16.08256 18.86818 16.15336
## 3 3 17.65616 15.91757 17.53037
## 4 4 18.12966 15.86927 17.10517
## 5 5 16.91912 16.89228 17.29270
## 6 6 15.97956 17.82542 17.29912
library(ggplot2)
library(ggpubr)
p1<-ggplot(df, aes(x=group1)) +
geom_histogram(color="black", fill="white")
p2<-ggplot(df, aes(x=group2)) +
geom_histogram(color="black", fill="white")
p3<-ggplot(df, aes(x=group3)) +
geom_histogram(color="black", fill="white")
all_graphs <- ggarrange(p1, p2, p3,
nrow = 1, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
all_graphs
get_metric <- function(z=NULL) {
if(is.null(z)){
x_obs <- 1:20
y_obs <- x_obs + 10*rnorm(20)
z <- data.frame(x_obs,y_obs)} # set up data frame
. <- lm(z[,2]~z[,1])
. <- summary(.)
. <- .$coefficients[2,1]
slope <- .
return(slope)
}
read_data <- function(z=NULL) {
if(is.null(z)){
x_obs <- 1:20
y_obs <- x_obs + 10*rnorm(20)
df <- data.frame(x_obs,
y_obs)} else {
# set up data frame
df <-read.table(file=z,
header=TRUE,
sep=",")}
return(df)
}
get_pval <- function(z=NULL) {
if(is.null(z)){
z <- list(x_obs=runif(1000),x_sim=runif(1000))}
p_lower <- mean(z[[2]]<=z[[1]])
p_upper <- mean(z[[2]]>=z[[1]])
return(c(pl=p_lower,pu=p_upper))
}
trt_group <- c(rep("Control",4),
rep("Treatment",5))
print(trt_group)
## [1] "Control" "Control" "Control" "Control" "Treatment" "Treatment"
## [7] "Treatment" "Treatment" "Treatment"
# create response var
z <- c(runif(4) + 1, runif(5) + 10)
print(z)
## [1] 1.353110 1.268682 1.771476 1.356823 10.353421 10.863466 10.202719
## [8] 10.283426 10.463244
# combine into data frame
df <- data.frame(trt=trt_group,res=z)
print(df)
## trt res
## 1 Control 1.353110
## 2 Control 1.268682
## 3 Control 1.771476
## 4 Control 1.356823
## 5 Treatment 10.353421
## 6 Treatment 10.863466
## 7 Treatment 10.202719
## 8 Treatment 10.283426
## 9 Treatment 10.463244
# look at means in observed data
obs <- tapply(df$res,df$trt,mean)
print(obs)
## Control Treatment
## 1.437523 10.433255
df_sim <- df
# randomization
df_sim$res <- sample(df_sim$res)
print(df_sim)
## trt res
## 1 Control 10.202719
## 2 Control 10.863466
## 3 Control 10.353421
## 4 Control 10.283426
## 5 Treatment 1.771476
## 6 Treatment 10.463244
## 7 Treatment 1.353110
## 8 Treatment 1.356823
## 9 Treatment 1.268682
sim <- tapply(df_sim$res,df$trt,mean)
print(sim)
## Control Treatment
## 10.425758 3.242667
################################
# FUNCTION: read_data
# purpose: read in or generate data set
# input: file name
# output: 3 column data frame of observed data (x,y)
#--------------------
read_data <- function(steep=1, z=NULL) {
if(is.null(z)){
x_obs <- 1:20
y_obs <- steep*x_obs + 10*rnorm(20)
df <- data.frame(x_obs,
y_obs)} else {
df <- read.table(file=z,
header=TRUE,
sep=",")}
return(df)
}
################################
# FUNCTION: get_metric
# purpose:
# input:
# output:
#--------------------
get_metric <- function(z=NULL) {
if(is.null(z)){
x_obs <- 1:20
y_obs <- x_obs + 10*rnorm(20)
z <- data.frame(x_obs,y_obs)} # set up data frame
. <- lm(z[,2]~z[,1])
. <- summary(.)
. <- .$coefficients[2,1]
slope <- .
return(slope)
}
##################################################
# function: shuffle_data
# randomize data for regression analysis
# input: 2-column data frame (x_var,y_var)
# output: 2-column data frame (x_var,y_var)
#-------------------------------------------------
shuffle_data <- function(z=NULL) {
if(is.null(z)){
x_obs <- 1:20
y_obs <- x_obs + 3*rnorm(20)
z <- data.frame(x_obs,y_obs)} # set up data frame
z[,2] <- sample(z[,2]) # use sample function with defaults to reshuffle column
return(z)
}
##################################################
# function: get_pval
# calculate p value from simulation
# input: list of observed metric, and vector of simulated metrics
# output: lower, upper tail probability values
#-------------------------------------------------
get_pval <- function(z=NULL) {
if(is.null(z)){
z <- list(x_obs=runif(1),x_sim=runif(1000))}
p_lower <- mean(z[[2]]<=z[[1]])
p_upper <- mean(z[[2]]>=z[[1]])
return(c(pl=p_lower,pu=p_upper))
}
##################################################
# function: plot_ran_test
# create ggplot of histogram of simulated values
# input: list of observed metric and vector of simulated metrics
# output: saved ggplot graph
#-------------------------------------------------
plot_ran_test <- function(z=NULL) {
if(is.null(z)){
z <- list(rnorm(1),rnorm(1000)) }
df <- data.frame(ID=seq_along(z[[2]]),sim_x=z[[2]])
p1 <- ggplot(data=df) +
aes(x=sim_x)
p1 + geom_histogram(aes(fill=I("goldenrod"),
color=I("black"))) +
geom_vline(aes(xintercept=z[[1]],
col="blue"))
}
library(ggplot2)
# global variables
n_sim <- 1000
x_sim <- rep(NA,n_sim)
#
df <- read_data(steep=2)
x_obs <- get_metric(df)
for (i in seq_len(n_sim)) {
x_sim[i] <- get_metric(shuffle_data(df))
}
slopes <- list(x_obs,x_sim)
get_pval(slopes)
## pl pu
## 1 0
plot_ran_test(slopes)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.