Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Data-Science/test prep.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
257 lines (213 sloc)
6.75 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -------------------------- | |
# Test Preparation Solutions | |
# (with feedback comments) | |
# R script | |
# -------------------------- | |
# ---------- | |
# Question 1 | |
# ---------- | |
# (code provided) | |
install.packages("ISwR") | |
library(tidyverse) | |
library(ISwR) | |
thuesen = as_tibble(thuesen) | |
model = lm(short.velocity~blood.glucose, data=thuesen) | |
summary(model) | |
# (a) Look at the output | |
# | |
# Call: | |
# lm(formula = short.velocity ~ blood.glucose, data = thuesen) | |
# | |
# Residuals: | |
# Min 1Q Median 3Q Max | |
# -0.40141 -0.14760 -0.02202 0.03001 0.43490 | |
# | |
# Coefficients: | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 1.09781 0.11748 9.345 6.26e-09 *** | |
# blood.glucose 0.02196 0.01045 2.101 0.0479 * | |
# --- | |
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 | |
# | |
# Residual standard error: 0.2167 on 21 degrees of freedom | |
# (1 observation deleted due to missingness) | |
# Multiple R-squared: 0.1737, Adjusted R-squared: 0.1343 | |
# F-statistic: 4.414 on 1 and 21 DF, p-value: 0.0479 | |
# | |
# * By looking at the Coefficients section of the output | |
# above, the line of best fit is: | |
# short.velocity = 1.09781 + 0.02196 * blood.glucose | |
# * The p-value corresponding to the blood.glucose | |
# coefficient, i.e., Pr(>|t|) in the Coefficients | |
# section of the output above, is 0.0479 < 0.05 | |
# so we say that "blood.glucose is a significant | |
# predictor of short.velocity", i.e., we would | |
# definitely keep it in the model. | |
# (b) Scatterplot code | |
ggplot(thuesen, aes(x=blood.glucose,y=short.velocity)) + | |
geom_point() + | |
geom_smooth(method="lm", se=FALSE) | |
# thuesen is the dataset | |
# aes() sets the x-axis to be blood.glucose | |
# and the y-axis to be short.velocity | |
# geom_point() draws the scatterplot of dots | |
# geom_smooth() draws the line of best fit | |
# (the linear model or "lm") | |
# (c) Mean of the residuals | |
# ... is ALWAYS zero | |
# ... OR modify the code provided | |
library(broom) | |
mean(pull(augment(model),.resid)) | |
# with output: | |
# [1] -3.613222e-18 | |
# ... which is scientific notation for | |
# 3.613222 with the decimal place shifted 18 places to the left | |
# i.e. 0.000000000000000003613222 | |
# Note that summary(model) gives the median and | |
# quartiles of the residuals but not mean | |
# (d) Diagnostic plots (code provided) | |
library(ggfortify) | |
autoplot(model) | |
# Residuals vs Fitted values: ok, blue line approx horizontal | |
# no nonlinear curve or funneling | |
# Normal Q-Q: the upper right points are a concern as they do not | |
# lie on the dashed line | |
# Scale-Location: ok | |
# Residuals vs Leverage: some outliers | |
# (e) 95% confidence interval for correlation coefficient | |
# need to use cor.test() not just cor() | |
cor.test(thuesen$blood.glucose,thuesen$short.velocity) | |
# or | |
cor.test(pull(thuesen,blood.glucose),pull(thuesen,short.velocity)) | |
# or | |
x = thuesen$blood.glucose | |
y = thuesen$short.velocity | |
cor(x,y) # gives NA due to some missing values | |
cor.test(x,y) | |
# or | |
x = pull(thuesen,blood.glucose) | |
y = pull(thuesen,short.velocity) | |
cor(x,y) # gives NA due to some missing values | |
cor.test(x,y) | |
# In each case cor.test() gives | |
# 95 percent confidence interval: | |
# 0.005496682 0.707429479 | |
# This confidence interval DOES NOT include zero, so we can conclude | |
# that in the population blood.glucose and short.velocity are | |
# positively correlated | |
# (but very unsure whether weak/moderate/strong) | |
# ---------- | |
# Question 2 | |
# ---------- | |
# (code provided) | |
library(tidyverse) | |
msleep | |
# (a) Barchart | |
ggplot(msleep,aes(x=vore)) + | |
geom_bar() | |
# Herbivore (herbi) is the most frequent vore | |
# msleep is the dataset | |
# aes() sets the x-axis to be vore | |
# and there is NO y-axis (this will be | |
# the counts) | |
# geom_bar() builds a frequency table and | |
# draws the bars | |
# you could do a horizontal barchart using | |
# coord_flip() | |
# (b) Boxplot | |
ggplot(msleep,aes(y=sleep_cycle)) + | |
geom_boxplot() | |
# Warning! Need some care with boxplot | |
# -- if there is only one then use y=... (there is only | |
# one thing on the x-axis) | |
# -- if side-by-side boxplots then use | |
# x=categorical, y=quantitative | |
# -- to switch to horizontal use coord_flip() | |
# The boxplot shows TWO outliers (dots) | |
# Calculate the upper fence: UQ+1.5*IQR | |
# -- DO NOT guesstimate from the plot | |
upper_fence = 0.5792 + 1.5*(0.5792-0.3333) | |
upper_fence | |
# [1] 0.94805 | |
# or | |
sort(msleep$sleep_cycle) | |
# or | |
sort(pull(msleep,sleep_cycle)) | |
# gives | |
# [1] 0.1166667 0.1166667 0.1333333 0.1500000 0.1500000 0.1666667 0.1833333 | |
# [8] 0.1833333 0.1833333 0.2000000 0.2000000 0.2166667 0.2166667 0.2333333 | |
# [15] 0.2833333 0.3333333 0.3333333 0.3500000 0.3833333 0.3833333 0.4166667 | |
# [22] 0.4166667 0.5000000 0.5500000 0.6666667 0.6666667 0.7500000 0.7666667 | |
# [29] 0.9000000 1.0000000 1.4166667 1.5000000 | |
# so the third from highest value is 1.0 (which is also the top of | |
# the whisker in the boxplot) | |
# Accept as answer either 1.0 or 0.94805 | |
# (c) Pipe, i.e., using %>% | |
msleep %>% | |
filter(sleep_total>=16,sleep_total<=18) %>% | |
select(name) %>% | |
arrange(name) | |
# (d) Pipe to build summary table | |
msleep %>% | |
group_by(conservation) %>% | |
summarise(count=n(),mean_bodywt=mean(bodywt)) | |
# (e) Pipe including join | |
conservation_table = tribble( | |
~conservation, ~description, | |
"cd", "Conservation Dependent", | |
"domesticated", "Domesticated", | |
"en", "Endangered", | |
"lc", "Least Concern", | |
"nt", "Near Threatened", | |
"vu", "Vulnerable" | |
) | |
# both tables have a column/variable called conservation | |
# so we can use a join | |
msleep %>% | |
group_by(conservation) %>% | |
summarise(count=n(),mean_bodywt=mean(bodywt)) %>% | |
filter(mean_bodywt==max(mean_bodywt)) %>% | |
left_join(conservation_table) %>% | |
select(description) | |
# with output ... | |
# # A tibble: 1 x 1 | |
# description# | |
# <chr> | |
# 1 Vulnerable | |
# Question 3 | |
# Answers are the R commands in each case | |
# X~N(16,2^2), find P(X>20) | |
# mean is 16 and standard deviation is 2 | |
# syntax: pnorm(x,mean,standard_deviation) | |
pnorm(70,75,12) - pnorm(60,75,12) | |
# [1] 0.02275013 | |
# (b) Find x so that P(X<x)=0.2 | |
qnorm(0.85,74,12) | |
# [1] 14.31676 | |
# (c) 1kg=1000g | |
# total weight of box T follows a normal distribution | |
# with mean 66*16 (remember to add means) | |
# and variance 66*2^2 (remember to add variances) | |
# so standard deviation is sqrt(66*4) | |
pnorm(1000,66*16,sqrt(66*4)) | |
# [1] 0.0002838844 | |
# comment: very tiny chance of box having < 1kg | |
# -- the end -- | |
library(tidyverse) | |
summary(diamonds) | |
diamonds %>% | |
filter() %>% | |
select(color) %>% | |
ggplot(diamonds,aes(x=carat)) + | |
geom_histogram(binwidth=0.02) | |
diamonds %>% | |
group_by(cut) %>% | |
summarise(number_diamond=n(), cor(carat, price)) | |
library(tidyverse) | |
library(ggplot2) | |
loading = c(3,8,10,11,13,16,27,30,35,37,38,44,103,142) | |
removal = c(4,7,8,8,10,16,26,21,9,31,30,75,90) | |
wetland = tibble(loading,removal) | |
ggplot(wetland, aes(x=loading,y=removal)) + | |
geom_point() + | |
geom_smooth(method='lm', se=FALSE) | |