
Scraping Beijing bus data with Python

# -*- coding: utf-8 -*-
# @Author :ZDHXN
# @File :beijingbus.py
# @Software : PyCharm
import csv
import urllib.request
from time import sleep
from urllib.parse import urljoin

from bs4 import BeautifulSoup as bs


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
# Accumulates one row of fields per bus route
all_data_list = []


# Collect the detail-page URL of every route listed on one listing page
def get_page_url(urls):
    req = urllib.request.Request(urls, headers=headers)
    html = urllib.request.urlopen(req)
    soup = bs(html.read(), 'html.parser')
    lu = soup.find('div', class_='list clearfix')
    for k in lu.find_all('a'):
        # Resolve the relative href against the listing page's URL
        detail_url = urljoin(urls, k['href'])
        get_page_info(detail_url)


# Extract the target fields from one route's detail page
def get_page_info(urls):
    req = urllib.request.Request(urls, headers=headers)
    html = urllib.request.urlopen(req)
    soup = bs(html.read(), 'html.parser')
    # Fields located with BeautifulSoup's select() method and CSS selectors
    # Route type
    line_type = soup.select('div.layout-left > div > div.info > h1 > a')[0].string
    try:
        # Total mileage (missing on some pages)
        mileage = soup.select('div.layout-left > div.change-info.mb20')[0].string
    except IndexError:
        mileage = ""

    # Remaining fields located with find()/find_all()
    # Route name
    line_name = soup.find("h1", {"class": "title"}).a.previous_sibling.string
    info_list = soup.find("ul", {"class": "bus-desc"})
    items = info_list.find_all("li")
    # Operating hours
    run_time = items[0].string
    # Reference fare
    ticket = items[1].string
    # Operating company
    company = items[2].text
    # Last updated
    update_last = items[3].div.previous_sibling.string

    line_name_list = soup.find_all("div", {"class": "trip"})
    line_list = soup.find_all("div", {"class": "bus-lzlist mb15"})

    wang_line_list = []
    fan_line_list = []

    wang_line_name = ""
    fan_line_name = ""

    for i in range(len(line_list)):
        if i == 0:
            # Outbound direction (the only block on circular routes)
            wang_line_list = line_list[0].find_all("li")
            wang_line_name = line_name + "(" + line_name_list[0].string + ")"
        else:
            # Return direction
            fan_line_list = line_list[1].find_all("li")
            fan_line_name = line_name + "(" + line_name_list[1].string + ")"

    # Bus route - outbound (circular routes default to this)
    wang_info = wang_line_name + "\n"
    # Bus route - return
    fan_info = fan_line_name + "\n"

    for i in range(len(wang_line_list)):
        if i != (len(wang_line_list) - 1):
            # Intermediate <li> items link their stops; anchors carrying a
            # title attribute are skipped, the rest supply the stop name
            for k in wang_line_list[i].find_all("a"):
                if not k.get('title'):
                    wang_info += k.string + ","
        else:
            # The final <li> is the terminus, stored as plain text
            wang_info += wang_line_list[i].string
    if len(fan_line_list) != 0:
        for i in range(len(fan_line_list)):
            if i != (len(fan_line_list) - 1):
                for k in fan_line_list[i].find_all("a"):
                    if not k.get('title'):
                        fan_info += k.string + ","
            else:
                fan_info += fan_line_list[i].string

    result_list = [line_name, line_type, run_time, mileage, ticket, company, update_last, wang_info, fan_info]
    all_data_list.append(result_list)


# Store the data to MySQL
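# A minimal sketch of that MySQL step, assuming a local MySQL server and the
# third-party pymysql package (pip install pymysql); the connection settings
# and the bus_info table below are illustrative, not from the original article.
def save_to_mysql(rows):
    import pymysql
    conn = pymysql.connect(host='localhost', user='root', password='',
                           database='test', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # One column per field of result_list, in the same order
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS bus_info (
                    line_name VARCHAR(64), line_type VARCHAR(32),
                    run_time VARCHAR(64), mileage VARCHAR(64),
                    ticket VARCHAR(128), company VARCHAR(128),
                    update_last VARCHAR(64), wang_info TEXT, fan_info TEXT
                )
            """)
            cursor.executemany(
                "INSERT INTO bus_info VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                rows)
        conn.commit()
    finally:
        conn.close()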



if __name__ == '__main__':
    url = ''  # base URL of the bus-route site (fill in before running)
    url_list = url + '/list%d'
    # Walk listing pages 1-9
    for k in range(1, 10):
        urls = url_list % k
        get_page_url(urls)
        sleep(1)  # pause between listing pages to avoid hammering the site

    # Save to a CSV file
    field_name = ["Route name", "Route type", "Operating hours", "Total mileage",
                  "Reference fare", "Operating company", "Last updated",
                  "Route - outbound (default for circular routes)", "Route - return"]
    path = "f:/data/test/bus_info.csv"
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(field_name)
        writer.writerows(all_data_list)

    # Store to a MySQL database (see the save_to_mysql sketch above)
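    # Illustrative call to the hypothetical helper sketched above; left
    # commented out because the original article stops at the CSV step.
    # save_to_mysql(all_data_list)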
