1. 页面分析
① 获取每支球员页面的url; ② 利用Python代码获取每个网页中的数据; ③ 将获取到的数据,存储至不同的数据库;
# 76人
https://nba.stats.qq.com/player/list.htm#teamId=20
# 火箭
https://nba.stats.qq.com/player/list.htm#teamId=10
# 热火
https://nba.stats.qq.com/player/list.htm#teamId=14
2. 数据爬取
from selenium import webdriver
# 创建浏览器对象,该操作会自动帮我们打开Google浏览器窗口
browser = webdriver.Chrome()
# 调用浏览器对象,向服务器发送请求。该操作会打开Google浏览器,并跳转到“百度”首页
browser.get("https://nba.stats.qq.com/player/list.htm#teamId=20")
# 最大化窗口
browser.maximize_window()
# 获取球员中文名
chinese_names = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[2]/a')
chinese_names_list = [i.text for i in chinese_names]
# 获取球员英文名
english_names = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[3]/a')
english_names_list = [i.get_attribute('title') for i in english_names] # 获取属性
# 获取球员号码
numbers = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[4]')
numbers_list = [i.text for i in numbers]
# 获取球员位置
locations = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[5]')
locations_list = [i.text for i in locations]
# 获取球员身高
heights = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[6]')
heights_list = [i.text for i in heights]
# 获取球员体重
weights = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[7]')
weights_list = [i.text for i in weights]
# 获取球员年龄
ages = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[8]')
ages_list = [i.text for i in ages_list]
# 获取球员球龄
qiu_lings = browser.find_elements_by_xpath('//div[@class="players"]//tr[@class="show"]/td[9]')
qiu_lings_list = [i.text for i in qiu_lings_list]
3. 存储至txt
for i in zip(chinese_names_list,english_names_list,numbers_list,locations_list,heights_list,weights_list,ages_list,qiu_lings_list):
with open("NBA.txt","a+",encoding="utf-8") as f:
# zip函数,得到的是一个元组,我们需要将它转换为一个字符串
f.write(str(i)[1:-1])
# 自动换行,好写入第2行数据
f.write("\n")
f.write("\n")
4. 存储至excel
import pandas as pd
# 一定要学会组织数据
df = pd.DataFrame({"中文名": chinese_names_list,
"英文名": english_names_list,
"球员号码": numbers_list,
"位置": locations_list,
"身高": heights_list,
"体重": weights_list,
"年龄": ages_list,
"球龄": qiu_lings_list})
# to_excel()函数
df.to_excel("NBA.xlsx",encoding="utf-8",index=None)
5. 存储至mysql
① 创建一个表nba
import pymysql
# 1. 连接数据库
db = pymysql.connect(host='localhost',user='root', password='123456',port=3306, db='demo', charset='utf8')
# 2. 创建一个表
# 创建一个游标对象;
cursor = db.cursor()
# 建表语句;
sql = """
create table NBA(
chinese_names_list varchar(20),
english_names_list varchar(20),
numbers_list varchar(20),
locations_list varchar(20),
heights_list varchar(20),
weights_list varchar(20),
ages_list varchar(20),
qiu_lings_list varchar(20)
)charset=utf8
"""
# 执行sql语句;
cursor.execute(sql)
# 断开数据库的连接;
db.close()
② 往表nba中插入数据
import pymysql
# 1. 组织数据
data_list = []
for i in zip(chinese_names_list,english_names_list,numbers_list,locations_list,heights_list,weights_list,ages_list,qiu_lings_list):
data_list.append(i)
# 2. 连接数据库
db = pymysql.connect(host='localhost',user='root', password='123456',port=3306, db='demo', charset='utf8')
# 创建一个游标对象;
cursor = db.cursor()
# 3. 插入数据
sql = 'insert into nba(chinese_names_list,english_names_list,numbers_list,locations_list,heights_list,weights_list,ages_list,qiu_lings_list) values(%s,%s,%s,%s,%s,%s,%s,%s)'
try:
cursor.executemany(sql,data_list)
db.commit()
print("插入成功")
except:
print("插入失败")
db.rollback()
db.close()
暂无数据