header
=
{
'user-agent'
:
'你自己的浏览器信息'
}
try
:
response
=
requests
.
get
(
url
,
headers
=
header
)
if
response
.
status_code
==
200
:
return
response
.
content
.
decode
(
'gbk'
)
else
:
print
(
"{0}网页请求状态码错误!{0}"
.
format
(
"-"
*
10
)
)
except
Exception
as
e
:
print
(
"{0}请求参数出现错误:{1}{0}"
.
format
(
"-"
*
10
,
e
)
)
def parse_url(url, xpath_path):
    """Fetch *url* and pair each matched node's text with its absolute link.

    The text nodes and ``href`` attributes found directly under
    ``xpath_path`` are collected separately and zipped together in
    document order.

    Args:
        url: Address of the page to download through ``get_html``.
        xpath_path: XPath expression selecting the elements whose text and
            ``href`` attribute are wanted.

    Returns:
        A list of ``(text, link)`` tuples where ``link`` is the ``href``
        resolved against ``url``. The list is empty when no page content
        was obtained.
    """
    # Function-scope import so this block does not rely on the file's
    # import section providing urljoin.
    from urllib.parse import urljoin

    html = get_html(url)
    if not html:
        # NOTE(review): get_html appears to print a message and yield None
        # when the request fails; feeding that to etree.HTML would raise,
        # so report "nothing found" instead.
        return []

    tree = etree.HTML(html)
    texts = tree.xpath(f'{xpath_path}/text()')
    hrefs = tree.xpath(f'{xpath_path}/@href')

    # urljoin gives the same result as "<url minus last segment>/<href>" for
    # plain relative hrefs, and additionally resolves root-relative,
    # absolute and "../" hrefs correctly.
    return [(text, urljoin(url, href)) for text, href in zip(texts, hrefs)]
def parse_url2(url, xpath_path):
    """Fetch *url* and return the text nodes found under *xpath_path*.

    Intended for the last level of the hierarchy, which has no further
    links to follow.

    Args:
        url: Address of the page to download through ``get_html``.
        xpath_path: XPath expression selecting the elements whose text is
            wanted.

    Returns:
        A list of text values in document order. The list is empty when no
        page content was obtained.
    """
    html = get_html(url)
    if not html:
        # NOTE(review): get_html appears to print a message and yield None
        # when the request fails; feeding that to etree.HTML would raise,
        # so report "nothing found" instead.
        return []

    tree = etree.HTML(html)
    return tree.xpath(f'{xpath_path}/text()')
# XPath expressions for the three page levels walked below: county rows and
# town rows carry a link in their second cell, village rows carry the name
# in their third cell.
county_xpath = '//tr[@class="countytr"]/td[2]/a'
town_xpath = '//tr[@class="towntr"]/td[2]/a'
village_xpath = '//tr[@class ="villagetr"]/td[3]'

# Entry page from which the crawl starts.
url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/34/3401.html"

# One [county, town, village] row per village-level entry.
result = []
for area1, county_url in parse_url(url, county_xpath):
    for area2, town_url in parse_url(county_url, town_xpath):
        result.extend(
            [area1, area2, village]
            for village in parse_url2(town_url, village_xpath)
        )

# Write the collected rows to a spreadsheet without the index column.
df = pd.DataFrame(result, columns=["区", "镇/街道", "居委会"])
df.to_excel("合肥市行政区域划分.xlsx", index=False)
# 【版权声明】本文为华为云社区用户原创内容,未经允许不得转载,如需转载请自行联系原作者进行授权。
# 如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:[email protected]