library
(
tidyverse
)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library
(
gapminder
)
## Warning: package 'gapminder' was built under R version 4.2.2
gapminder
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ℹ 1,694 more rows
可视化探索
画个简单的图
gapminder
%>%
ggplot
(
aes
(
x
=
log
(
gdpPercap
)
, y
=
lifeExp
)
)
+
geom_point
(
alpha
=
0.2
)
我们想用
不同的模型
拟合
log(gdpPercap)
与
lifeExp
的关联
library
(
colorspace
)
## Warning: package 'colorspace' was built under R version 4.2.2
model_colors
<-
colorspace
::
qualitative_hcl
(
4
, palette
=
"dark 2"
)
# model_colors <- c("darkorange", "purple", "cyan4")
ggplot
(
data
=
gapminder
,
mapping
=
aes
(
x
=
log
(
gdpPercap
)
, y
=
lifeExp
)
geom_point
(
alpha
=
0.2
)
+
geom_smooth
(
method
=
"lm"
,
aes
(
color
=
"OLS"
, fill
=
"OLS"
)
# one
geom_smooth
(
method
=
"lm"
, formula
=
y
~
splines
::
bs
(
x
, df
=
3
)
,
aes
(
color
=
"Cubic Spline"
, fill
=
"Cubic Spline"
)
# two
geom_smooth
(
method
=
"loess"
,
aes
(
color
=
"LOESS"
, fill
=
"LOESS"
)
# three
scale_color_manual
(
name
=
"Models"
, values
=
model_colors
)
+
scale_fill_manual
(
name
=
"Models"
, values
=
model_colors
)
+
theme
(
legend.position
=
"top"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## Coefficients:
## (Intercept) gdpPercap pop continentAmericas
## 4.781e+01 4.495e-04 6.570e-09 1.348e+01
## continentAsia continentEurope continentOceania
## 8.193e+00 1.747e+01 1.808e+01
str
(
out
)
summary
(
out
)
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## Residuals:
## Min 1Q Median 3Q Max
## -49.161 -4.486 0.297 5.110 25.175
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.781e+01 3.395e-01 140.819 < 2e-16 ***
## gdpPercap 4.495e-04 2.346e-05 19.158 < 2e-16 ***
## pop 6.570e-09 1.975e-09 3.326 0.000901 ***
## continentAmericas 1.348e+01 6.000e-01 22.458 < 2e-16 ***
## continentAsia 8.193e+00 5.712e-01 14.342 < 2e-16 ***
## continentEurope 1.747e+01 6.246e-01 27.973 < 2e-16 ***
## continentOceania 1.808e+01 1.782e+00 10.146 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 8.365 on 1697 degrees of freedom
## Multiple R-squared: 0.5821, Adjusted R-squared: 0.5806
## F-statistic: 393.9 on 6 and 1697 DF, p-value: < 2.2e-16
模型的输出结果是一个复杂的list,图
51.1
给出了
out
的结构
图 51.1: 线性模型结果的示意图
我们发现
out
对象包含了很多元素,比如系数、残差、模型残差自由度等等,用读取列表的方法可以直接读取
out
$
coefficients
out
$
residuals
out
$
fitted.values
事实上,前面使用的
suammary()
函数只是选取和打印了
out
对象的一小部分信息,同时这些信息的结构不适合用
dplyr
操作和
ggplot2
画图。
tidy
(
out
)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.78e+1 0.340 141. 0
## 2 gdpPercap 4.50e-4 0.0000235 19.2 3.24e- 74
## 3 pop 6.57e-9 0.00000000198 3.33 9.01e- 4
## 4 continentAmericas 1.35e+1 0.600 22.5 5.19e- 98
## 5 continentAsia 8.19e+0 0.571 14.3 4.06e- 44
## 6 continentEurope 1.75e+1 0.625 28.0 6.34e-142
## 7 continentOceania 1.81e+1 1.78 10.1 1.59e- 23
out
%>%
tidy
(
)
%>%
ggplot
(
mapping
=
aes
(
x
=
term
,
y
=
estimate
)
)
+
geom_point
(
)
+
coord_flip
(
)
可以很方便的获取系数的置信区间
out
%>%
tidy
(
conf.int
=
TRUE
)
## # A tibble: 7 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.78e+1 3.40e-1 141. 0 4.71e+1 4.85e+1
## 2 gdpPercap 4.50e-4 2.35e-5 19.2 3.24e- 74 4.03e-4 4.96e-4
## 3 pop 6.57e-9 1.98e-9 3.33 9.01e- 4 2.70e-9 1.04e-8
## 4 continentAmericas 1.35e+1 6.00e-1 22.5 5.19e- 98 1.23e+1 1.47e+1
## 5 continentAsia 8.19e+0 5.71e-1 14.3 4.06e- 44 7.07e+0 9.31e+0
## 6 continentEurope 1.75e+1 6.25e-1 28.0 6.34e-142 1.62e+1 1.87e+1
## 7 continentOceania 1.81e+1 1.78e+0 10.1 1.59e- 23 1.46e+1 2.16e+1
out
%>%
tidy
(
conf.int
=
TRUE
)
%>%
filter
(
!
term
%in%
c
(
"(Intercept)"
)
)
%>%
ggplot
(
aes
(
x
=
reorder
(
term
,
estimate
)
,
y
=
estimate
, ymin
=
conf.low
, ymax
=
conf.high
)
)
+
geom_pointrange
(
)
+
coord_flip
(
)
+
labs
(
x
=
""
, y
=
"OLS Estimate"
)
augment
augment()
会返回一个数据框,这个数据框是在原始数据框的基础上,增加了模型的拟合值(
.fitted
), 拟合值的标准误(
.se.fit
), 残差(
.resid
)等列。
augment
(
out
)
## # A tibble: 1,704 × 10
## lifeExp gdpPercap pop continent .fitted .resid .hat .sigma .cooksd
## <dbl> <dbl> <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28.8 779. 8425333 Asia 56.4 -27.6 0.00322 8.34 0.00505
## 2 30.3 821. 9240934 Asia 56.4 -26.1 0.00321 8.34 0.00450
## 3 32.0 853. 10267083 Asia 56.5 -24.5 0.00320 8.35 0.00393
## 4 34.0 836. 11537966 Asia 56.5 -22.4 0.00319 8.35 0.00330
## 5 36.1 740. 13079460 Asia 56.4 -20.3 0.00319 8.35 0.00271
## 6 38.4 786. 14880372 Asia 56.5 -18.0 0.00317 8.36 0.00212
## 7 39.9 978. 12881816 Asia 56.5 -16.7 0.00316 8.36 0.00181
## 8 40.8 852. 13867957 Asia 56.5 -15.7 0.00317 8.36 0.00160
## 9 41.7 649. 16317921 Asia 56.4 -14.7 0.00318 8.36 0.00142
## 10 41.8 635. 22227415 Asia 56.4 -14.7 0.00314 8.36 0.00139
## # ℹ 1,694 more rows
## # ℹ 1 more variable: .std.resid <dbl>
out
%>%
augment
(
)
%>%
ggplot
(
mapping
=
aes
(
x
=
lifeExp
, y
=
.fitted
)
)
+
geom_point
(
)
glance
(
out
)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.582 0.581 8.37 394. 3.94e-317 6 -6034. 12084. 12127.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
penguins
%>%
group_nest
(
species
)
%>%
mutate
(
model
=
purrr
::
map
(
data
,
~
lm
(
bill_depth_mm
~
bill_length_mm
, data
=
.
)
)
)
%>%
mutate
(
glance
=
purrr
::
map
(
model
,
~
broom
::
glance
(
.
)
)
)
%>%
tidyr
::
unnest
(
glance
)
## # A tibble: 3 × 15
## species data model r.squared adj.r.squared sigma statistic p.value df
## <fct> <list<ti> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie [146 × 7] <lm> 0.149 0.143 1.13 25.2 1.51e- 6 1
## 2 Chinst… [68 × 7] <lm> 0.427 0.418 0.866 49.2 1.53e- 9 1
## 3 Gentoo [119 × 7] <lm> 0.428 0.423 0.749 87.5 7.34e-16 1
## # ℹ 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>
fit_ols
<-
function
(
df
)
{
lm
(
body_mass_g
~
bill_depth_mm
+
bill_length_mm
, data
=
df
)
out_tidy
<-
penguins
%>%
group_nest
(
species
)
%>%
mutate
(
model
=
purrr
::
map
(
data
,
fit_ols
)
)
%>%
mutate
(
tidy
=
purrr
::
map
(
model
,
~
broom
::
tidy
(
.
)
)
)
%>%
tidyr
::
unnest
(
tidy
)
%>%
dplyr
::
filter
(
!
term
%in%
"(Intercept)"
)
out_tidy
## # A tibble: 6 × 8
## species data model term estimate std.error statistic p.value
## <fct> <list<tibble[,7]>> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie [146 × 7] <lm> bill… 164. 25.1 6.51 1.17e-9
## 2 Adelie [146 × 7] <lm> bill… 64.8 11.5 5.64 8.88e-8
## 3 Chinstrap [68 × 7] <lm> bill… 159. 43.3 3.67 4.98e-4
## 4 Chinstrap [68 × 7] <lm> bill… 23.8 14.7 1.62 1.11e-1
## 5 Gentoo [119 × 7] <lm> bill… 255. 40.0 6.37 4.01e-9
## 6 Gentoo [119 × 7] <lm> bill… 54.7 12.7 4.30 3.54e-5
out_tidy
%>%
ggplot
(
aes
(
x
=
species
, y
=
estimate
,
ymin
=
estimate
-
2
*
std.error
,
ymax
=
estimate
+
2
*
std.error
,
color
=
term
)
)
+
geom_pointrange
(
position
=
position_dodge
(
width
=
0.25
)
)
+
theme
(
legend.position
=
"top"
)
+
labs
(
x
=
NULL
, y
=
"Estimate"
, color
=
"coef"
)