• R's strengths: free and open source, well suited to statistics and data analysis, simple syntax, a large user community, comprehensive functionality, and excellent data visualization.

  • R's weaknesses: relatively poor performance, weak multi-core support, and difficulty handling very large text data.

  • Common pitfalls when learning R

  1. Following the wrong learning path and only skimming the data types
  2. Neglecting the fundamentals and blindly reusing scripts written by others
  3. Too little practice and an unwillingness to write code yourself

Why start learning a programming language from its data types?

  1. The hard part: data-type knowledge is both extensive and intricate
  2. The key part: data wrangling is the first step of any data analysis
  3. The error-prone part: roughly 90% of bugs are caused by misused data types

R data types: basic types

  • Numeric
  • Character
  • Logical

numeric


  • Types: integer, double
    a_integer <- 1
    a_double <- 0.1

    is.integer(a_integer)
    # FALSE
    is.double(a_double)
    # TRUE

    a_integer <- as.integer(1)
    is.integer(a_integer)
    # TRUE

    # Numbers in R are double by default
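A quick way to check this default for yourself (a minimal sketch; typeof() reports the underlying storage type):

    typeof(1)    # "double"  -- a plain numeric literal is stored as double
    typeof(1L)   # "integer" -- the L suffix creates an integer literal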

character (strings)

a_character <- "Hello World"
a_num_character <- "123"
is.character(a_character)
# TRUE
is.numeric(a_num_character)
# FALSE
a_num_character <- as.numeric(a_num_character)
is.numeric(a_num_character)
# TRUE
paste("Hello_World", "123", sep = "|")
# "Hello_World|123"

substr1 <- substr("Hello_World", start = 2, stop = 5)
# "ello"


logical (Boolean)

a_logical <- TRUE
is.logical(a_logical)
# TRUE
a_numeric <- 1
is.logical(a_numeric)
# FALSE
a_numeric <- as.logical(a_numeric)
is.logical(a_numeric)
# TRUE

R data types: multi-dimensional types

  • vector
  • matrix
  • array
  • data frame (table)
  • list

vector

  • Element types: numeric, character, or logical
    b_numeric_vector_1 <- c(1, 2, 3)
    b_numeric_vector_2 <- c(1:3)
    b_numeric_vector_3 <- c(1, 2:3)

    b_logical_vector_2 <- c(TRUE, FALSE, TRUE)

    b_character_vector <- c("a", "b", "c")
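A vector holds elements of a single type; mixing types silently coerces everything to the most general type, which is a common source of the type bugs mentioned earlier. A minimal sketch:

    c(1, 2, "3")        # "1" "2" "3"  -- the numbers are coerced to character
    c(TRUE, FALSE, 2)   # 1 0 2       -- the logicals are coerced to numeric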

matrix


matrix1 <- matrix(c(1:50), nrow = 5)
# 5 rows, 10 columns
matrix2 <- matrix(c(1:50), nrow = 5, byrow = TRUE)
# 5 rows, 10 columns, filled by row

a100 <- c(1:100)
matrix3 <- matrix(a100, nrow = 10)
# 10 rows, 10 columns
matrix3[10, 5:7]
# select row 10, columns 5 to 7
matrix3[10, 5:7] <- c(1, 2, 3)
# replace row 10, columns 5 to 7 with 1, 2, 3

# matrix arithmetic
matrix3 * matrix3
# element-wise multiplication
matrix3 + matrix3
# element-wise addition
matrix3 * c(1:10)
# the vector is recycled down the columns: row 1 is multiplied by 1, row 2 by 2, ...
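Note that * above is element-wise; true matrix multiplication uses the %*% operator. A small sketch for contrast:

m <- matrix(1:4, nrow = 2)
m * m      # element-wise: each entry squared
m %*% m    # matrix product
t(m)       # transpose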

Data frame

  • Constructor: data.frame()
  • Arguments: several vectors of equal length; the argument names become the column names
name <- c("Bob", "Jak", "Li", "Wang", "Zhang")
score <- c(80, 70, 60, 90, 45)
class <- c(1, 2, 1, 2, 2)
student_score <- data.frame(name = name, score = score, class = class)

# Data frames can be indexed in many ways; a few of them are sketched below
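Some common ways to pull values out of a data frame (a brief sketch using the student_score frame defined above):

student_score$score                          # one column as a vector
student_score[["score"]]                     # the same column, selected by name
student_score[, c("name", "score")]          # several columns
student_score[student_score$class == 1, ]    # rows matching a condition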

List (a special kind of vector whose elements can be of any type)

list1 <- list(1, 2, 3, 4, 5, 6)

list1[c(1:3)]

list2 <- list(c(1:3), c(1:2), "hello", matrix(c(1:100), nrow = 10), list(1, 2, 3, 4, 5, 6))

# [] returns a sub-list, while [[]] extracts the element itself, as the sketch below shows
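A quick illustration of the difference, using list2 from above:

list2[1]            # a list of length 1 that contains the vector 1:3
list2[[1]]          # the vector 1:3 itself
class(list2[1])     # "list"
class(list2[[1]])   # "integer"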

### 2. Basic syntax

#### 2.1 for

k <- 0
for (i in c(1:100)) {
  k <- k + i
}
print(k)
# 5050

for (name in c("a", "b", "c")) {
  print(name)
}

while

i <- 10
while (i > 0) {
  i <- i - 1
  print(i)
}

# infinite loop
while (TRUE) {
  print("true")
}

if/else

score <- 60
if (score >= 90) {
  print("excellent")
} else if (score >= 60 & score < 90) {
  print("good")
} else {
  print("bad")
}

switch

x <- 3
switch(x, "first", "second", "third", "fourth")
# "third"
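With a numeric first argument switch() selects by position, as above; with a character argument it matches the named alternatives instead. A small sketch of the character form:

grade <- "B"
switch(grade, A = "excellent", B = "good", "unknown")
# "good"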

Functions

strsplit("abcde", split = "")
# "a" "b" "c" "d" "e"

grep("ab", c("ab", "bc", "ab", "abc"))
# 1 3 4

length(c(1:100))
# 100

# re-implement the built-in length() function

length_out <- function(x) {
  l <- 0
  for (i in x) {
    l <- l + 1
  }
  return(l)
}

Assignment 1: R language basics

#202213093015李健豪
# 4.1 Creating the leadership data frame

manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender,
age,q1, q2, q3, q4, q5, stringsAsFactors=FALSE)
print(leadership)

# 4.2 Creating new variables
# Method 1
mydata<-data.frame(x1 = c(2, 2, 6, 4),x2 = c(3, 4, 2, 8))
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2)/2
attach(mydata)
# attach() adds the data frame to R's search path, so its columns can be referenced
# directly by name; without it, the bare column names used in Method 2 below would not be found.
print(mydata$sumx)
# Method 2
mydata$sumx <- x1 + x2
mydata$meanx <- (x1 + x2)/2
print(mydata$sumx)
detach(mydata)
# detach() does the opposite: it removes the data frame from the search path
# Method 3
mydata <- transform(mydata, sumx = x1 + x2, meanx = (x1 + x2)/2)
print(mydata$sumx)

# 4.3 Recoding variables
leadership$age[leadership$age==99] <-NA
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 & leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
# The statement variable[condition] <- expression performs the assignment only where condition is TRUE.

# The same recoding can be written more compactly with within():
leadership <- within(leadership,{
agecat <- NA
agecat[age > 75] <- "Elder"
agecat[age >= 55 & age <= 75] <- "Middle Aged"
agecat[age < 55] <- "Young" })

print(leadership)

# 4.4 Renaming variables
# You can use:
# fix(leadership)
# to open an interactive editor: click a variable name and rename it in the dialog that appears.
# Programmatically, variables can be renamed with the names() function,
# e.g. names(leadership)[2] <- "testDate"

# Example
names(leadership)[6:10] <- c("item1", "item2", "item3", "item4", "item5")
# Alternatively, use the rename() function from the plyr package
install.packages("plyr")
library(plyr)
leadership <- rename(leadership, c(manager="managerID", date="testDate"))

# 4.5 Missing values
# In a project of any size, data can be incomplete because of unanswered questions,
# equipment failures, or miscoded values.
# In R, missing values are represented by the symbol NA (Not Available).
# The is.na() function tests whether values are missing.
is.na(leadership[,6:10])

# 4.5.1 Recoding values to missing
leadership$age[leadership$age == 99] <- NA

# 4.5.2 Excluding missing values from analyses

x <- c(1, 2, NA, 3)
y <- x[1] + x[2] + x[3] + x[4]
z <- sum(x)
# Because the third element of x is missing, y and z are both NA (missing)

x <- c(1, 2, NA, 3)
y <- sum(x, na.rm=TRUE)
# Most numeric functions have an na.rm=TRUE option that removes missing values
# before the calculation and works with the remaining values

# Example: na.omit() deletes any row containing missing data
newdata <- na.omit(leadership)
newdata

# 4.6 Date values
# Dates are usually entered into R as character strings and then converted
# to date variables that are stored numerically.
# The syntax is as.Date(x, "input_format"), where x is character data and
# input_format gives the appropriate format for reading the date.

myformat <- "%m/%d/%y"
leadership$date <- as.Date(leadership$date, myformat)
leadership$date

# Sys.Date() returns the current date (date() would also include the time)
Sys.Date()

# format() outputs a date in a specified format and can extract parts of a date value
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")

# Arithmetic can be performed on date values
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2011-01-22")
days <- enddate - startdate
days

# The difftime() function calculates a time interval in the units you choose
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")

# 4.6.1 Converting dates to character variables
# The as.character() function converts date values to character strings, as sketched below
#strDates <- as.character(dates)
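# A minimal round-trip sketch (the dates vector below is a made-up example):
dates <- as.Date(c("10/24/08", "5/1/09"), "%m/%d/%y")
strDates <- as.character(dates)
strDates
# "2008-10-24" "2009-05-01"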

# 4.7 Type conversions
# Adding a character string to a numeric vector converts every element of the vector to character.
a <- c(1,2,3)
a
is.numeric(a)
is.vector(a)
a <- as.character(a)
a
is.numeric(a)
is.vector(a)
is.character(a)

# 4.8 Sorting data
# The order() function sorts a data frame; the default sort order is ascending.

# Example
newdata <- leadership[order(leadership$age),]
attach(leadership)
newdata <- leadership[order(gender, age),]
detach(leadership)
attach(leadership)
newdata <-leadership[order(gender, -age),]
detach(leadership)

# 4.9 Merging datasets

# 4.9.1 Adding columns to a data frame
# Use the merge() function,
# e.g. total <- merge(dataframeA, dataframeB, by="ID")
# or total <- merge(dataframeA, dataframeB, by=c("ID","Country"))

# 4.9.2 Adding rows to a data frame
# Use the rbind() function,
# e.g. total <- rbind(dataframeA, dataframeB); a runnable sketch of both follows below
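# A small runnable sketch of both operations (dfA, dfB, and dfC are made-up examples):
dfA <- data.frame(ID = 1:3, x = c("a", "b", "c"))
dfB <- data.frame(ID = 1:3, y = c(10, 20, 30))
merged <- merge(dfA, dfB, by="ID")   # adds columns, matching rows on ID
dfC <- data.frame(ID = 4, x = "d")
stacked <- rbind(dfA, dfC)           # adds rows; the columns must match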

# 4.10 Subsetting datasets

# 4.10.1 Selecting (keeping) variables
# Example
newdata <- leadership[, c(6:10)]
myvars <- c("q1", "q2", "q3", "q4", "q5")
newdata <-leadership[myvars]
myvars <- paste("q", 1:5, sep="")
newdata <- leadership[myvars]
# Note: check whether the leadership columns are still named qN or were renamed to itemN above

# 4.10.2 Excluding (dropping) variables
# Example
myvars <- names(leadership) %in% c("q3", "q4")
newdata <- leadership[!myvars]

# Knowing that q3 and q4 are the 8th and 9th variables, you can use:
newdata <- leadership[c(-8,-9)]

# Or:
leadership$q3 <- leadership$q4 <- NULL

# 4.10.3 Selecting observations
# Example
newdata <- leadership[1:3,]
newdata <- leadership[leadership$gender=="M" & leadership$age > 30,]
attach(leadership)
newdata <- leadership[gender=='M' & age > 30,]
detach(leadership)

# 4.10.4 The subset() function
# subset() is probably the easiest way to select both variables and observations
# Example
newdata <- subset(leadership, age >= 35 | age < 24, select=c(q1, q2, q3, q4))
newdata <- subset(leadership, gender=="M" & age > 25,select=gender:q4)

# 4.10.5 Random samples
# Draw a random sample of size 3 from the leadership dataset
mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]

# The first argument to sample() is a vector of elements to choose from; here it is
# 1 through the number of observations in the data frame. The second argument is the
# number of elements to draw, and the third requests sampling without replacement.
# sample() returns the randomly drawn elements, which are then used to select rows of the data frame.
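# For reproducible draws, set the random seed first (an optional step, not part of the original listing):
set.seed(42)
mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]
mysample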

# 4.11 Using SQL statements to manipulate data frames

install.packages("sqldf")

# The sqldf() function lets you run SQL SELECT statements against data frames
# Example
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",row.names=TRUE)
newdf

Assignment 2: Data visualization

## 6.1 Bar plots

### 6.1.1 Simple bar plots

# If height is a vector, its values determine the heights of the bars and a vertical
# bar plot is drawn. The option horiz=TRUE produces a horizontal bar plot instead.
# Annotation options can be added: main sets the plot title, while xlab and ylab
# set the x-axis and y-axis labels.

install.packages("vcd")
library(vcd)
counts <- table(Arthritis$Improved)
counts
barplot(counts, main="Simple Bar Plot", xlab="Improvement", ylab="Frequency") # simple bar plot
barplot(counts, main="Horizontal Bar Plot", xlab="Frequency", ylab="Improvement", horiz=TRUE) # horizontal bar plot

### 6.1.2 Stacked and grouped bar plots

# If height is a matrix rather than a vector, the result is a stacked or grouped bar plot.
# With beside=FALSE (the default), each column of the matrix produces one bar and the values
# in that column give the heights of the stacked "sub-bars". With beside=TRUE, each column
# represents a group and its values are placed side by side rather than stacked.

library(vcd)
counts <- table(Arthritis$Improved, Arthritis$Treatment)
counts
barplot(counts, main="Stacked Bar Plot", xlab="Treatment", ylab="Frequency", col=c("red", "yellow","green"), legend=rownames(counts)) # stacked bar plot
barplot(counts, main="Grouped Bar Plot", xlab="Treatment", ylab="Frequency", col=c("red", "yellow", "green"), legend=rownames(counts), beside=TRUE) # grouped bar plot

### 6.1.3 Mean bar plots

states <- data.frame(state.region, state.x77)
means <- aggregate(states$Illiteracy, by=list(state.region), FUN=mean)
means

means <- means[order(means$x),]
means

barplot(means$x, names.arg=means$Group.1)
title("Mean Illiteracy Rate")

### 6.1.4 Tweaking bar plots

par(mar=c(5,8,4,2)) # increase the size of the y margin
par(las=2)          # rotate the bar labels
counts <- table(Arthritis$Improved)
barplot(counts, main="Treatment Outcome", horiz=TRUE, cex.names=0.8,
names.arg=c("No Improvement", "Some Improvement", "Marked Improvement"))

### 6.1.5 Spinograms

library(vcd)
attach(Arthritis)
counts <- table(Treatment, Improved)
spine(counts, main="Spinogram Example")
detach(Arthritis)

## 6.2 Pie charts
# combine four plots into one figure
par(mfrow=c(2, 2))
slices <- c(10, 12,4, 16, 8)
lbls <- c("US", "UK", "Australia", "Germany", "France")
pie(slices, labels = lbls, main="Simple Pie Chart")

# add percentage labels to the pie chart
pct <- round(slices/sum(slices)*100)
lbls2 <- paste(lbls, " ", pct, "%", sep="")
pie(slices, labels=lbls2, col=rainbow(length(lbls2)),
main="Pie Chart with Percentages")

library(plotrix)
pie3D(slices, labels=lbls,explode=0.1, main="3D Pie Chart ")
mytable <- table(state.region)
lbls3 <- paste(names(mytable), "\n", mytable, sep="")
pie(mytable, labels = lbls3,
main="Pie Chart from a Table\n (with sample sizes)")

# fan plot
install.packages("plotrix")
library(plotrix)
slices <- c(10, 12,4, 16, 8)
lbls <- c("US", "UK", "Australia", "Germany", "France")
fan.plot(slices, labels = lbls, main="Fan Plot")

## 6.3 Histograms

par(mfrow=c(2,2))

hist(mtcars$mpg)
hist(mtcars$mpg, breaks=12, col="red", xlab="Miles Per Gallon", main="Colored histogram with 12 bins")
hist(mtcars$mpg, freq=FALSE, breaks=12, col="red", xlab="Miles Per Gallon", main="Histogram, rug plot, density curve")
rug(jitter(mtcars$mpg))
lines(density(mtcars$mpg), col="blue", lwd=2)
x <- mtcars$mpg
h<-hist(x, breaks=12, col="red", xlab="Miles Per Gallon", main="Histogram with normal curve and box")
xfit<-seq(min(x), max(x), length=40)
yfit<-dnorm(xfit, mean=mean(x), sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
box()

rug(jitter(mtcars$mpg, amount=0.01))


## 6.4 Kernel density plots
par(mfrow=c(2,1))
d <- density(mtcars$mpg)
plot(d)
d <- density(mtcars$mpg)
plot(d, main="Kernel Density of Miles Per Gallon")
polygon(d, col="red", border="blue")
rug(mtcars$mpg, col="brown")

# comparative kernel density plots

install.packages("sm")

library(sm)
attach(mtcars)
cyl.f <- factor(cyl, levels= c(4,6,8), labels = c("4 cylinder", "6 cylinder","8 cylinder"))
sm.density.compare(mpg, cyl, xlab="Miles Per Gallon")
title(main="MPG Distribution by Car Cylinders")
colfill<-c(2:(1+length(levels(cyl.f))))
legend(locator(1), levels(cyl.f), fill=colfill)
detach(mtcars)

## 6.5 Box plots

### 6.5.1 Using parallel box plots to compare groups
boxplot(mpg ~ cyl, data=mtcars, main="Car Mileage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon")

# notched box plots
boxplot(mpg ~ cyl, data=mtcars, notch=TRUE, varwidth=TRUE,col="red",main="Car Mileage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon")

# box plots for two crossed factors
mtcars$cyl.f <- factor(mtcars$cyl, levels=c(4,6,8), labels=c("4","6","8"))
mtcars$am.f <- factor(mtcars$am, levels=c(0,1), labels=c("auto", "standard"))
boxplot(mpg ~ am.f *cyl.f, data=mtcars, varwidth=TRUE, col=c("gold","darkgreen"), main="MPG Distribution by Auto Type", xlab="Auto Type", ylab="Miles Per Gallon")


### 6.5.2 Violin plots

install.packages("vioplot")
library(vioplot)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
vioplot(x1, x2, x3, names=c("4 cyl", "6 cyl", "8 cyl"),col="gold")
title("Violin Plots of Miles Per Gallon", ylab="Miles Per Gallon", xlab="Number of Cylinders")


## 6.6 Dot plots

dotchart(mtcars$mpg, labels=row.names(mtcars), cex=.7, main="Gas Mileage for Car Models", xlab="Miles Per Gallon")

# dot plot grouped, sorted, and colored by cylinder count
x <- mtcars[order(mtcars$mpg),]
x$cyl <- factor(x$cyl)
x$color[x$cyl==4] <- "red"
x$color[x$cyl==6] <- "blue"
x$color[x$cyl==8] <- "darkgreen"
dotchart(x$mpg, labels = row.names(x),cex=.7, groups = x$cyl, gcolor = "black",
color = x$color,pch=19, main = "Gas Mileage for Car Models\ngrouped by cylinder", xlab = "Miles Per Gallon")

Assignment 3: Basic statistical analysis

# 7.1 Descriptive statistics
myvars <- c("mpg", "hp", "wt")
head(mtcars[myvars])
# examine descriptive statistics by transmission type (am) and number of cylinders (cyl)

# 7.1.1 A menagerie of methods
myvars <- c("mpg", "hp", "wt")

# descriptive statistics via summary()
summary(mtcars[myvars])

# descriptive statistics via sapply() and a user-defined function
mystats <- function(x, na.omit=FALSE){
if (na.omit)
x <- x[!is.na(x)]
m <- mean(x)
n <- length(x)
s <- sd(x)
skew <- sum((x-m)^3/s^3)/n
kurt <- sum((x-m)^4/s^4)/n - 3
return(c(n=n, mean=m, stdev=s, skew=skew, kurtosis=kurt))
}
myvars <- c("mpg", "hp", "wt")
sapply(mtcars[myvars], mystats)

# 7.1.2 Even more methods

# descriptive statistics via describe() in the Hmisc package
install.packages("Hmisc")
library(Hmisc)
myvars <- c("mpg", "hp", "wt")
describe(mtcars[myvars])

# descriptive statistics via stat.desc() in the pastecs package
install.packages("pastecs")
library(pastecs)
stat.desc(mtcars[myvars])

# descriptive statistics via describe() in the psych package
install.packages("psych")
library(psych)
describe(mtcars[myvars])

# 7.1.3 Descriptive statistics by group

# descriptive statistics by group using aggregate()
myvars <- c("mpg", "hp", "wt")
aggregate(mtcars[myvars], by=list(am=mtcars$am), mean)

# descriptive statistics by group using by()
dstats <- function(x)sapply(x, mystats)
by(mtcars[myvars], mtcars$am, dstats)

# 7.1.4 Additional methods by group

# summary statistics by group using summaryBy() in the doBy package
install.packages("doBy")
library(doBy)
summaryBy(mpg+hp+wt~am, data=mtcars, FUN=mystats)

# summary statistics by group using describeBy() in the psych package
library(psych)
describeBy(mtcars[myvars], list(am=mtcars$am))

# 7.2 Frequency and contingency tables

install.packages("vcd")
library(vcd)
head(Arthritis)

# 7.2.1 Generating frequency tables

# 1. One-way tables
# table() generates simple frequency counts
mytable <- with(Arthritis, table(Improved))
mytable
# prop.table() turns the frequencies into proportions
prop.table(mytable)

# or use prop.table()*100 to get percentages
prop.table(mytable)*100

# 2. Two-way tables
mytable <- xtabs(~ Treatment+Improved, data=Arthritis)
mytable

# margin.table() and prop.table() generate marginal frequencies and proportions, respectively
margin.table(mytable, 1)
prop.table(mytable, 1)
margin.table(mytable, 2)
prop.table(mytable, 2)

# generate a two-way table with CrossTable()
install.packages("gmodels")
library(gmodels)
CrossTable(Arthritis$Treatment, Arthritis$Improved)

# 3. Multidimensional tables
mytable <- xtabs(~ Treatment+Sex+Improved, data=Arthritis)
mytable

ftable(mytable)

margin.table(mytable, 1)

margin.table(mytable, 2)

margin.table(mytable, c(1, 3))

ftable(addmargins(prop.table(mytable, c(1, 2)), 3))

# 7.2.2 Tests of independence

# chi-square test of independence
library(vcd)
mytable <- xtabs(~Treatment+Improved, data=Arthritis)
chisq.test(mytable)
mytable <- xtabs(~Improved+Sex, data=Arthritis)
chisq.test(mytable)

# 2. Fisher's exact test

mytable <- xtabs(~Treatment+Improved, data=Arthritis)
fisher.test(mytable)

# 3. Cochran-Mantel-Haenszel test
mytable <- xtabs(~Treatment+Improved+Sex, data=Arthritis)
mantelhaen.test(mytable)

# 7.2.3 Measures of association
# measures of association for a two-way table

mytable <- xtabs(~Treatment+Improved, data=Arthritis)
assocstats(mytable)

# 7.3 Correlations

# 7.3.1 Types of correlations

# 1. Pearson, Spearman, and Kendall correlations
# covariances and correlation coefficients
states<- state.x77[,1:6]

cov(states)

cor(states)

cor(states, method="spearman")


# a non-square correlation matrix can also be computed
x <- states[,c("Population", "Income", "Illiteracy", "HS Grad")]
y <- states[,c("Life Exp", "Murder")]
cor(x,y)

# 2. Partial correlations
install.packages("ggm")
library(ggm)
colnames(states)
pcor(c(1,5,2,3,6), cov(states))

# 7.3.2 Testing correlations for significance
cor.test(states[,3], states[,5])

# corr.test() computes the correlation matrix along with significance tests
library(psych)
corr.test(states, use="complete")

# 7.4 t-tests

install.packages("MASS")
library(MASS)
t.test(Prob ~ So, data=UScrime)

# 7.4.2 Dependent t-test
sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(x))))
with(UScrime, t.test(U1, U2, paired=TRUE))

# 7.5.1 Nonparametric comparison of two groups
with(UScrime, by(Prob, So, median))

sapply(UScrime[c("U1","U2")], median)

with(UScrime, wilcox.test(U1, U2, paired=TRUE))

# 7.5.2 Comparing more than two groups

states <- data.frame(state.region, state.x77)
kruskal.test(Illiteracy ~ state.region, data=states)

Assignment 4: Regression

# Regression
# 8.2.2 Simple linear regression

fit <- lm(weight ~ height, data=women)
summary(fit)

women$weight
fitted(fit)
residuals(fit)
plot(women$height,women$weight, xlab="Height (in inches)", ylab="Weight (in pounds)")
abline(fit)

# 8.2.3 Polynomial regression
fit2 <- lm(weight ~ height + I(height^2), data=women)
summary(fit2)
plot(women$height,women$weight, xlab="Height (in inches)", ylab="Weight (in lbs)")
lines(women$height,fitted(fit2))

# A polynomial of degree n produces a curve with n-1 bends. To fit a cubic polynomial, use:
fit3 <- lm(weight ~ height + I(height^2) +I(height^3), data=women)

# The scatterplot() function in the car package makes it easy to plot a bivariate relationship
install.packages("car")
library(car)
scatterplot(weight ~ height, data=women, spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Women Age 30-39", xlab="Height (inches)", ylab="Weight (lbs.)")
#There were 12 warnings (use warnings() to see them)

# 8.2.4 Multiple linear regression

# Using the state.x77 dataset from base R, explore how a state's murder rate relates to other factors.
# lm() needs a data frame (state.x77 is a matrix), so convert it first for easier handling:
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
cor(states)
library(car)
scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2), main="Scatter Plot Matrix")
# There were 50 or more warnings (use warnings() to see the first 50)
warnings()
# Warning message: (nothing was listed when this was run)

# Fit the multiple linear regression model with lm()
states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
lm(formula=Murder ~ Population + Illiteracy + Income + Frost, data=states)

# 8.2.5 Multiple linear regression with interactions
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)
# The effect() function in the effects package displays interaction results graphically
install.packages("effects")
library(effects)
plot(effect("hp:wt", fit,, list(wt=c(2.2,3.2,4.2))), multiline=TRUE)

# 8.3 Regression diagnostics

states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
confint(fit)

# 8.3.1 A typical approach

# Applying plot() to the object returned by lm() produces four graphs for evaluating the model fit
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)

# diagnostic plots for the quadratic fit
fit2 <- lm(weight ~ height + I(height^2), data=women)
par(mfrow=c(2,2))
plot(fit2)

# Dropping observations 13 and 15 produces a better fit
newfit <- lm(weight~ height + I(height^2), data=women[-c(13,15),])

# diagnostics for the states multiple regression
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
par(mfrow=c(2,2))
plot(fit)

# 8.3.2 An enhanced approach

# 1. Normality
# Compared with the base plot() function, qqPlot() gives a more accurate check of the normality assumption
library(car)
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
qqPlot(fit, labels=row.names(states), id.method="identify",
simulate=TRUE, main="Q-Q Plot")

# a function for plotting studentized residuals
residplot <- function(fit, nbreaks=10)
{ z <- rstudent(fit)
hist(z, breaks=nbreaks, freq=FALSE, xlab="Studentized Residual", main="Distribution of Errors")
rug(jitter(z), col="brown")
curve(dnorm(x, mean=mean(z), sd=sd(z)),
add=TRUE, col="blue", lwd=2)
lines(density(z)$x, density(z)$y, col="red", lwd=2, lty=2)
legend("topright", legend = c( "Normal Curve", "Kernel Density Curve"),
lty=1:2, col=c("blue","red"), cex=.7)
}
residplot(fit)

# 2. Independence of errors
durbinWatsonTest(fit)

# 3. Linearity
library(car)
crPlots(fit)

# 4. Homoscedasticity
# ncvTest() produces a score test: the null hypothesis is constant error variance, and the
# alternative is that the error variance changes with the level of the fitted values.
library(car)
ncvTest(fit)
spreadLevelPlot(fit)

# 8.3.3 Global validation of linear model assumptions
install.packages("gvlma")
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# 8.4.1 Outliers
# outlierTest() reports the Bonferroni-adjusted p-value for the largest absolute studentized residual
library(car)
outlierTest(fit)

# 8.4.2 High-leverage points

hat.plot <- function(fit) {
p <- length(coefficients(fit))
n <- length(fitted(fit))
plot(hatvalues(fit), main="Index Plot of Hat Values")
abline(h=c(2,3)*p/n, col="red", lty=2)
identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)

# 8.4.3 Influential observations

cutoff <- 4/(nrow(states)-length(fit$coefficients)-2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

library(car)
avPlots(fit, ask=FALSE, id.method="identify")

# 8.5.1 Deleting observations

# 8.5.2 Transforming variables
# When a model violates the normality, linearity, or homoscedasticity assumptions,
# transforming one or more variables can often improve or correct the fit
library(car)
summary(powerTransform(states$Murder))

# 8.6.1 Comparing models
# compare nested models with anova()
states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
anova(fit2, fit1)

# compare models with AIC
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
AIC(fit1,fit2)

# 8.6.2 Variable selection

# 1. Stepwise regression
library(MASS)
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
stepAIC(fit, direction="backward")

# 2. All-subsets regression
install.packages("leaps")
library(leaps)
states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
leaps <-regsubsets(Murder ~ Population + Illiteracy + Income + Frost, data=states, nbest=4)
plot(leaps, scale="adjr2")
library(car)
subsets(leaps, statistic="cp", main="Cp Plot for All Subsets Regression")
abline(1,1,lty=2,col="red")

# 8.7 Taking the analysis further

# 8.7.1 Cross-validation
# k-fold cross-validation of R-squared
install.packages("bootstrap")
library(bootstrap)
shrinkage <- function(fit, k=10){
require(bootstrap)
theta.fit <- function(x,y){lsfit(x,y)}
theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef}
x <- fit$model[,2:ncol(fit$model)]
y <- fit$model[,1]
results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
r2 <- cor(y, fit$fitted.values)^2
r2cv <- cor(y, results$cv.fit)^2
cat("Original R-square =", r2, "\n")
cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
cat("Change =", r2-r2cv, "\n")
}
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=states)
shrinkage(fit)

# 8.7.2 Relative importance
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)

# relweights() computes the relative weights of the predictor variables
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
import <- as.data.frame(import)
row.names(import) <- names(fit$model[2:nvar])
names(import) <- "Weights"
import <- import[order(import),1, drop=FALSE]
dotchart(import$Weights, labels=row.names(import),
xlab="% of R-Square", pch=19,
main="Relative Importance of Predictor Variables",
sub=paste("Total R-Square=", round(rsquare, digits=3)),
...)
return(import)
}

# applying the relweights() function
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="blue")

Assignment 5: Resampling and bootstrapping

# Chapter 12: Resampling statistics and bootstrapping
# 12.1 Permutation tests
install.packages(c("coin"))
install.packages(file.choose(), repos=NULL, type="source") # or install from a local source file

# 12.2 Permutation tests with the coin package
# 12.2.1 Independent two-sample and K-sample tests
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
t.test(score~treatment, data=mydata, var.equal=TRUE)
oneway_test(score~treatment, data=mydata, distribution="exact")

# Wilcoxon rank-sum test of the difference in incarceration probability between Southern and non-Southern US states

library(MASS)
UScrime <- transform(UScrime, So = factor(So))
wilcox_test(Prob ~ So, data=UScrime, distribution="exact")

# For a sample of 50 patients, one-way ANOVA was used to evaluate the effect of five
# drug treatments on cholesterol reduction. The code below runs an approximate K-sample permutation test instead.
install.packages("multcomp")
library(multcomp)
set.seed(1234)
oneway_test(response~trt, data=cholesterol,distribution=approximate(B=9999))

# 12.2.2 Independence in contingency tables
# A permutation version of the chi-square test can be run as follows:
library(coin)
library(vcd)
Arthritis <- transform(Arthritis, Improved=as.factor(as.numeric(Improved)))
set.seed(1234)
chisq_test(Treatment~Improved, data=Arthritis, distribution=approximate(B=9999))

# 12.2.3 Independence between numeric variables
# permutation test of the correlation between US illiteracy rates and murder rates
states <- as.data.frame(state.x77)
set.seed(1234)
spearman_test(Illiteracy~Murder, data=states,distribution=approximate(B=9999))

# 12.2.4 Dependent two-sample and K-sample tests
library(coin)
library(MASS)
wilcoxsign_test(U1~U2, data=UScrime, distribution="exact")

# 12.3 Permutation tests with the lmPerm package

# 12.3.1 Simple and polynomial regression

install.packages("lmPerm")
library(lmPerm)
set.seed(1234)
fit <- lmp(weight~height, data=women, perm="Prob")
summary(fit)


# permutation test for polynomial regression
set.seed(1234)
fit <- lmp(weight~height + I(height^2), data=women, perm="Prob")
summary(fit)

# 12.3.2 Multiple regression
# permutation test for multiple regression
set.seed(1234)
states <- as.data.frame(state.x77)
fit <- lmp(Murder~Population + Illiteracy+Income+Frost, data=states, perm="Prob")
summary(fit)

# 12.3.3 One-way ANOVA and ANCOVA
# permutation test for one-way ANOVA
library(lmPerm)
library(multcomp)
set.seed(1234)
fit <- aovp(response~trt, data=cholesterol, perm="Prob")
anova(fit)

# permutation test for one-way ANCOVA
library(lmPerm)
set.seed(1234)
fit <- aovp(weight ~ gesttime + dose, data=litter, perm="Prob")
anova(fit)

# 12.3.4 Two-way ANOVA
library(lmPerm)
set.seed(1234)
fit <- aovp(len~supp*dose, data=ToothGrowth, perm="Prob")
anova(fit)

# 12.5 Bootstrapping
# Bootstrapping repeatedly resamples the initial sample with replacement to build an empirical
# distribution of one or more statistics of interest. Confidence intervals and hypothesis tests
# can then be obtained without assuming any particular theoretical distribution.
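# A minimal hand-rolled sketch of the idea (not part of the assignment): bootstrap the mean of mpg
set.seed(1234)
boot_means <- replicate(1000, mean(sample(mtcars$mpg, replace=TRUE)))
quantile(boot_means, c(0.025, 0.975))   # a simple percentile confidence interval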

# 12.6 Bootstrapping with the boot package
install.packages("boot")
# 12.6.1 Bootstrapping a single statistic

rsq <- function(formula, data, indices) {
d <- data[indices,]
fit <- lm(formula, data=d)
return(summary(fit)$r.square)
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq, R=1000, formula=mpg~wt+disp)
print(results)

boot.ci(results, type=c("perc", "bca"))

# 12.6.2 Bootstrapping several statistics
bs <- function(formula, data, indices) {
d <- data[indices,]
fit <- lm(formula, data=d)
return(coef(fit))
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=bs, R=1000, formula=mpg~wt+disp)
print(results)