1.

set up numeric vector and counter

counter <- 0
vector <- as.integer(runif(100, max=5, min=-5))

function to count zeros in numeric vector

count_zeros <- function(x) {
  for (i in 1:length(x)) {
    if (x[i] == 0) {
      counter <- counter + 1}
  }
  return(counter)
}

run code

count_zeros(x=vector)

## [1] 22

2.

subsetting instead of loop

counter <- length(which(vector==0))
print(counter)

## [1] 22

3.

creating a specified matrix

FUNCTION: create_matrix

purpose: create matrix with x*y dimensions in which each element is the product of the row number x the column number y

input: x, y

output: matrix

create_matrix <- function(x,y) {
  mat <- matrix(sample(1:10, size = x*y),
              nrow=x,
              ncol=y)
  for(i in 1:x) {
    for(j in 1:y) {
      (mat[i,j] <- i*j)
    }
  }
  return(mat)
}

create_matrix(2,2)

##      [,1] [,2]
## [1,]    1    2
## [2,]    2    4

4.

a.

group1 <- data.frame(rnorm(50, mean=7, sd=1))
colnames(group1) <- "group1"
head(group1)

##     group1
## 1 7.174038
## 2 6.254457
## 3 7.716757
## 4 7.273712
## 5 7.802367
## 6 6.135106

group2 <- data.frame(rnorm(50, mean=17, sd=1))
colnames(group2) <- "group2"
head(group2)

##     group2
## 1 16.81634
## 2 15.60387
## 3 17.08514
## 4 15.20910
## 5 18.00867
## 6 15.65002

group3 <- data.frame(rnorm(50, mean=27, sd=1))
colnames(group3) <- "group3"
head(group3)

##     group3
## 1 26.69755
## 2 26.46541
## 3 26.62676
## 4 28.10484
## 5 26.88139
## 6 27.51957

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

BindGroups <- cbind(group1, group2, group3)
head(BindGroups)

##     group1   group2   group3
## 1 7.174038 16.81634 26.69755
## 2 6.254457 15.60387 26.46541
## 3 7.716757 17.08514 26.62676
## 4 7.273712 15.20910 28.10484
## 5 7.802367 18.00867 26.88139
## 6 6.135106 15.65002 27.51957

BindGroups <- pivot_longer(BindGroups, cols=group1:group3, names_to = "group")
head(BindGroups)

## # A tibble: 6 × 2
##   group  value
##   <chr>  <dbl>
## 1 group1  7.17
## 2 group2 16.8 
## 3 group3 26.7 
## 4 group1  6.25
## 5 group2 15.6 
## 6 group3 26.5

b.

FUNCTION: shuffle_and_mean

purpose: shuffle values in dataframe and return means of each group

input: dataframe with groups and values

output: dataframe of means of shuffled groups

shuffle_and_mean <- function(x) {
  Shuffle <- transform(x, shuffled=sample(value))
  group1 <- mean(Shuffle$shuffled[Shuffle$group=="group1"])
  group2 <- mean(Shuffle$shuffled[Shuffle$group=="group2"])
  group3 <- mean(Shuffle$shuffled[Shuffle$group=="group3"])
  means <- cbind(group1,group2,group3)
  return(means)
}

shuffle_and_mean(BindGroups)

##        group1   group2   group3
## [1,] 17.83731 15.25844 18.00835

c.

df <- data.frame()
replicate <- c(1:100)

for(i in 1:100) {
  means <- shuffle_and_mean(BindGroups)
  df <- rbind(df,means)
}

df <- cbind(replicate, df)

head(df)

##   replicate   group1   group2   group3
## 1         1 17.21124 15.92943 17.96343
## 2         2 16.08256 18.86818 16.15336
## 3         3 17.65616 15.91757 17.53037
## 4         4 18.12966 15.86927 17.10517
## 5         5 16.91912 16.89228 17.29270
## 6         6 15.97956 17.82542 17.29912

d.

library(ggplot2)
library(ggpubr)

p1<-ggplot(df, aes(x=group1)) + 
  geom_histogram(color="black", fill="white")

p2<-ggplot(df, aes(x=group2)) + 
  geom_histogram(color="black", fill="white")

p3<-ggplot(df, aes(x=group3)) + 
  geom_histogram(color="black", fill="white")

all_graphs <- ggarrange(p1, p2, p3,
                        nrow = 1, ncol = 3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

all_graphs

get_metric <- function(z=NULL) {
                if(is.null(z)){
                  x_obs <- 1:20
                  y_obs <-  x_obs + 10*rnorm(20)
                  z <- data.frame(x_obs,y_obs)} # set up data frame                 
. <- lm(z[,2]~z[,1])
. <- summary(.)
. <- .$coefficients[2,1]

slope <- .
return(slope)
}

read_data <- function(z=NULL) {
  
                if(is.null(z)){
                  x_obs <- 1:20
                  y_obs <- x_obs + 10*rnorm(20)
                  df <- data.frame(x_obs,
                                   y_obs)} else { 
# set up data frame                 
                  df <-read.table(file=z,
                                   header=TRUE,
                                   sep=",")}

return(df)
}

get_pval <- function(z=NULL) {
                    if(is.null(z)){
                      z <- list(x_obs=runif(1000),x_sim=runif(1000))}
                      p_lower <- mean(z[[2]]<=z[[1]])
                      p_upper <- mean(z[[2]]>=z[[1]])
return(c(pl=p_lower,pu=p_upper))
}

extra p-value histogram

trt_group <- c(rep("Control",4),
                rep("Treatment",5))
print(trt_group)

## [1] "Control"   "Control"   "Control"   "Control"   "Treatment" "Treatment"
## [7] "Treatment" "Treatment" "Treatment"

# create response var
z <- c(runif(4) + 1, runif(5) + 10)
print(z)

## [1]  1.353110  1.268682  1.771476  1.356823 10.353421 10.863466 10.202719
## [8] 10.283426 10.463244

# combine into data frame
df <- data.frame(trt=trt_group,res=z)
print(df)

##         trt       res
## 1   Control  1.353110
## 2   Control  1.268682
## 3   Control  1.771476
## 4   Control  1.356823
## 5 Treatment 10.353421
## 6 Treatment 10.863466
## 7 Treatment 10.202719
## 8 Treatment 10.283426
## 9 Treatment 10.463244

# look at means in observed data
obs <- tapply(df$res,df$trt,mean)
print(obs)

##   Control Treatment 
##  1.437523 10.433255

df_sim <- df

# randomization

df_sim$res <- sample(df_sim$res)
print(df_sim)

##         trt       res
## 1   Control 10.202719
## 2   Control 10.863466
## 3   Control 10.353421
## 4   Control 10.283426
## 5 Treatment  1.771476
## 6 Treatment 10.463244
## 7 Treatment  1.353110
## 8 Treatment  1.356823
## 9 Treatment  1.268682

sim <- tapply(df_sim$res,df$trt,mean)
print(sim)

##   Control Treatment 
## 10.425758  3.242667

################################
# FUNCTION: read_data
# purpose: read in or generate data set
# input: file name
# output:  3 column data frame of observed data (x,y)
#-------------------- 
read_data <- function(steep=1, z=NULL) {
  
  if(is.null(z)){
    x_obs <- 1:20
    y_obs <- steep*x_obs + 10*rnorm(20)
    df <- data.frame(x_obs,
                     y_obs)} else {
                       df <- read.table(file=z,
                                        header=TRUE,
                                        sep=",")}
  return(df)

}

################################
# FUNCTION: get_metric
# purpose: 
# input: 
# output:  
#-------------------- 
get_metric <- function(z=NULL) {
  if(is.null(z)){
    x_obs <- 1:20
    y_obs <-  x_obs + 10*rnorm(20)
    z <- data.frame(x_obs,y_obs)} # set up data frame                 
  . <- lm(z[,2]~z[,1])
  . <- summary(.)
  . <- .$coefficients[2,1]
  
  slope <- .
  return(slope)
}

##################################################
# function: shuffle_data
# randomize data for regression analysis
# input: 2-column data frame (x_var,y_var)
# output: 2-column data frame (x_var,y_var)
#------------------------------------------------- 
shuffle_data <- function(z=NULL) {
  if(is.null(z)){
    x_obs <- 1:20
    y_obs <- x_obs + 3*rnorm(20)
    z <- data.frame(x_obs,y_obs)} # set up data frame                 
  z[,2] <- sample(z[,2]) # use sample function with defaults to reshuffle column
  
  return(z)
}

##################################################
# function: get_pval
# calculate p value from simulation
# input: list of observed metric, and vector of simulated metrics
# output: lower, upper tail probability values
#------------------------------------------------- 
get_pval <- function(z=NULL) {
  if(is.null(z)){
    z <- list(x_obs=runif(1),x_sim=runif(1000))}
  p_lower <- mean(z[[2]]<=z[[1]])
  p_upper <- mean(z[[2]]>=z[[1]])
  return(c(pl=p_lower,pu=p_upper))
}

##################################################
# function: plot_ran_test
# create ggplot of histogram of simulated values
# input: list of observed metric and vector of simulated metrics
# output: saved ggplot graph
#------------------------------------------------- 
plot_ran_test <- function(z=NULL) {
  if(is.null(z)){
    z <- list(rnorm(1),rnorm(1000)) }
  df <- data.frame(ID=seq_along(z[[2]]),sim_x=z[[2]])
  
  p1 <- ggplot(data=df) + 
    aes(x=sim_x)
  
  p1 + geom_histogram(aes(fill=I("goldenrod"),
                          color=I("black"))) +
    geom_vline(aes(xintercept=z[[1]],
                   col="blue")) 
  
}
library(ggplot2)
# global variables
n_sim <- 1000
x_sim <- rep(NA,n_sim)
# 
df <- read_data(steep=2)
x_obs <- get_metric(df)

for (i in seq_len(n_sim)) {
  x_sim[i] <- get_metric(shuffle_data(df))
}

slopes <- list(x_obs,x_sim)
get_pval(slopes)

## pl pu 
##  1  0

plot_ran_test(slopes)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Homework_10

Greta Savitsky

3/30/2022

1.

set up numeric vector and counter

function to count zeros in numeric vector

run code

2.

subsetting instead of loop

3.

creating a specified matrix

FUNCTION: create_matrix

purpose: create matrix with x*y dimensions in which each element is the product of the row number x the column number y

input: x, y

output: matrix

4.

a.

b.

FUNCTION: shuffle_and_mean

purpose: shuffle values in dataframe and return means of each group

input: dataframe with groups and values

output: dataframe of means of shuffled groups

c.

d.

extra p-value histogram