Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# --------------------------
# Test Preparation Solutions
# (with feedback comments)
# R script
# --------------------------
# ----------
# Question 1
# ----------
# (code provided)
install.packages("ISwR")
library(tidyverse)
library(ISwR)
thuesen = as_tibble(thuesen)
model = lm(short.velocity~blood.glucose, data=thuesen)
summary(model)
# (a) Look at the output
#
# Call:
# lm(formula = short.velocity ~ blood.glucose, data = thuesen)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.40141 -0.14760 -0.02202 0.03001 0.43490
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.09781 0.11748 9.345 6.26e-09 ***
# blood.glucose 0.02196 0.01045 2.101 0.0479 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.2167 on 21 degrees of freedom
# (1 observation deleted due to missingness)
# Multiple R-squared: 0.1737, Adjusted R-squared: 0.1343
# F-statistic: 4.414 on 1 and 21 DF, p-value: 0.0479
#
# * By looking at the Coefficients section of the output
# above, the line of best fit is:
# short.velocity = 1.09781 + 0.02196 * blood.glucose
# * The p-value corresponding to the blood.glucose
# coefficient, i.e., Pr(>|t|) in the Coefficients
# section of the output above, is 0.0479 < 0.05
# so we say that "blood.glucose is a significant
# predictor of short.velocity", i.e., we would
# definitely keep it in the model.
# (b) Scatterplot code
ggplot(thuesen, aes(x=blood.glucose,y=short.velocity)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
# thuesen is the dataset
# aes() sets the x-axis to be blood.glucose
# and the y-axis to be short.velocity
# geom_point() draws the scatterplot of dots
# geom_smooth() draws the line of best fit
# (the linear model or "lm")
# (c) Mean of the residuals
# ... is ALWAYS zero
# ... OR modify the code provided
library(broom)
mean(pull(augment(model),.resid))
# with output:
# [1] -3.613222e-18
# ... which is scientific notation for
# 3.613222 with the decimal place shifted 18 places to the left
# i.e. 0.000000000000000003613222
# Note that summary(model) gives the median and
# quartiles of the residuals but not mean
# (d) Diagnostic plots (code provided)
library(ggfortify)
autoplot(model)
# Residuals vs Fitted values: ok, blue line approx horizontal
# no nonlinear curve or funneling
# Normal Q-Q: the upper right points are a concern as they do not
# lie on the dashed line
# Scale-Location: ok
# Residuals vs Leverage: some outliers
# (e) 95% confidence interval for correlation coefficient
# need to use cor.test() not just cor()
cor.test(thuesen$blood.glucose,thuesen$short.velocity)
# or
cor.test(pull(thuesen,blood.glucose),pull(thuesen,short.velocity))
# or
x = thuesen$blood.glucose
y = thuesen$short.velocity
cor(x,y) # gives NA due to some missing values
cor.test(x,y)
# or
x = pull(thuesen,blood.glucose)
y = pull(thuesen,short.velocity)
cor(x,y) # gives NA due to some missing values
cor.test(x,y)
# In each case cor.test() gives
# 95 percent confidence interval:
# 0.005496682 0.707429479
# This confidence interval DOES NOT include zero, so we can conclude
# that in the population blood.glucose and short.velocity are
# positively correlated
# (but very unsure whether weak/moderate/strong)
# ----------
# Question 2
# ----------
# (code provided)
library(tidyverse)
msleep
# (a) Barchart
ggplot(msleep,aes(x=vore)) +
geom_bar()
# Herbivore (herbi) is the most frequent vore
# msleep is the dataset
# aes() sets the x-axis to be vore
# and there is NO y-axis (this will be
# the counts)
# geom_bar() builds a frequency table and
# draws the bars
# you could do a horizontal barchart using
# coord_flip()
# (b) Boxplot
ggplot(msleep,aes(y=sleep_cycle)) +
geom_boxplot()
# Warning! Need some care with boxplot
# -- if there is only one then use y=... (there is only
# one thing on the x-axis)
# -- if side-by-side boxplots then use
# x=categorical, y=quantitative
# -- to switch to horizontal use coord_flip()
# The boxplot shows TWO outliers (dots)
# Calculate the upper fence: UQ+1.5*IQR
# -- DO NOT guesstimate from the plot
upper_fence = 0.5792 + 1.5*(0.5792-0.3333)
upper_fence
# [1] 0.94805
# or
sort(msleep$sleep_cycle)
# or
sort(pull(msleep,sleep_cycle))
# gives
# [1] 0.1166667 0.1166667 0.1333333 0.1500000 0.1500000 0.1666667 0.1833333
# [8] 0.1833333 0.1833333 0.2000000 0.2000000 0.2166667 0.2166667 0.2333333
# [15] 0.2833333 0.3333333 0.3333333 0.3500000 0.3833333 0.3833333 0.4166667
# [22] 0.4166667 0.5000000 0.5500000 0.6666667 0.6666667 0.7500000 0.7666667
# [29] 0.9000000 1.0000000 1.4166667 1.5000000
# so the third from highest value is 1.0 (which is also the top of
# the whisker in the boxplot)
# Accept as answer either 1.0 or 0.94805
# (c) Pipe, i.e., using %>%
msleep %>%
filter(sleep_total>=16,sleep_total<=18) %>%
select(name) %>%
arrange(name)
# (d) Pipe to build summary table
msleep %>%
group_by(conservation) %>%
summarise(count=n(),mean_bodywt=mean(bodywt))
# (e) Pipe including join
conservation_table = tribble(
~conservation, ~description,
"cd", "Conservation Dependent",
"domesticated", "Domesticated",
"en", "Endangered",
"lc", "Least Concern",
"nt", "Near Threatened",
"vu", "Vulnerable"
)
# both tables have a column/variable called conservation
# so we can use a join
msleep %>%
group_by(conservation) %>%
summarise(count=n(),mean_bodywt=mean(bodywt)) %>%
filter(mean_bodywt==max(mean_bodywt)) %>%
left_join(conservation_table) %>%
select(description)
# with output ...
# # A tibble: 1 x 1
# description#
# <chr>
# 1 Vulnerable
# Question 3
# Answers are the R commands in each case
# X~N(16,2^2), find P(X>20)
# mean is 16 and standard deviation is 2
# syntax: pnorm(x,mean,standard_deviation)
pnorm(70,75,12) - pnorm(60,75,12)
# [1] 0.02275013
# (b) Find x so that P(X<x)=0.2
qnorm(0.85,74,12)
# [1] 14.31676
# (c) 1kg=1000g
# total weight of box T follows a normal distribution
# with mean 66*16 (remember to add means)
# and variance 66*2^2 (remember to add variances)
# so standard deviation is sqrt(66*4)
pnorm(1000,66*16,sqrt(66*4))
# [1] 0.0002838844
# comment: very tiny chance of box having < 1kg
# -- the end --
library(tidyverse)
summary(diamonds)
diamonds %>%
filter() %>%
select(color) %>%
ggplot(diamonds,aes(x=carat)) +
geom_histogram(binwidth=0.02)
diamonds %>%
group_by(cut) %>%
summarise(number_diamond=n(), cor(carat, price))
library(tidyverse)
library(ggplot2)
loading = c(3,8,10,11,13,16,27,30,35,37,38,44,103,142)
removal = c(4,7,8,8,10,16,26,21,9,31,30,75,90)
wetland = tibble(loading,removal)
ggplot(wetland, aes(x=loading,y=removal)) +
geom_point() +
geom_smooth(method='lm', se=FALSE)