Crawling the Laomaotao Website and Writing the Data into MS SQL Server
To crawl the Laomaotao website and write the data into an MS SQL Server database, follow these steps:
First install the dependencies (pymssql is required for the SQL Server connection):

pip install requests
pip install beautifulsoup4
pip install pymssql
import requests
from bs4 import BeautifulSoup
import pymssql
import random
import time
import socket


# Write one parsed record into SQL Server.
# Replace the connection details with your actual database information.
def insert_data(data):
    db = pymssql.connect(host='localhost', user='sa', password='your_password',
                         database='your_database', port=1433)
    cursor = db.cursor()
    sql = "INSERT INTO t_tq (rq, tq, zgwd, zdwd) VALUES (%s, %s, %s, %s)"
    try:
        cursor.execute(sql, (data['rq'], data['tq'], data['zgwd'], data['zdwd']))
        db.commit()
        return data
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        db.close()


# Fetch a page, retrying with a randomized pause on timeouts and request errors.
def get_page_content(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    timeout = random.randint(80, 180)
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.encoding = 'utf-8'
            return response.text
        except socket.timeout as e:
            print(f"Socket timeout: {e}")
            time.sleep(random.randint(8, 15))
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            time.sleep(random.randint(5, 10))


# Parse the 7-day forecast list into one record per day.
# Field names: rq = date, tq = weather, zgwd = highest temp, zdwd = lowest temp.
def parse_data(html_text):
    items = []
    soup = BeautifulSoup(html_text, 'html.parser')
    container = soup.find('div', {'id': '7d'})
    if not container:
        return items
    ul = container.find('ul')
    if not ul:
        return items
    for li in ul.find_all('li'):
        item_data = {'rq': None, 'tq': None, 'zgwd': None, 'zdwd': None}
        h1 = li.find('h1')
        if h1:
            item_data['rq'] = h1.get_text(strip=True)
        paragraphs = li.find_all('p')
        if not paragraphs:
            continue
        item_data['tq'] = paragraphs[0].get_text(strip=True)
        if len(paragraphs) >= 2:
            temp_p = paragraphs[1]
            # The highest temperature sits in a <span>, the lowest in an <i>;
            # check both, since one <p> usually contains both tags.
            span = temp_p.find('span')
            if span and span.get_text(strip=True):
                item_data['zgwd'] = span.get_text(strip=True).replace('℃', '')
            i_tag = temp_p.find('i')
            if i_tag and i_tag.get_text(strip=True):
                item_data['zdwd'] = i_tag.get_text(strip=True).replace('℃', '')
        items.append(item_data)
    return items


def main():
    url = 'http://www.weather.com.cn/weather/101190401.shtml'
    try:
        html = get_page_content(url)
        for item in parse_data(html):
            insert_data(item)
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()

Connection parameters to replace with your own values:

host: database host address
user: database username
password: database password
database: database name
port: database port

Running the code

Run the script above in PyCharm. It fetches the page specified in main(), parses each day's record, and writes the results into the MS SQL Server database.
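The INSERT statement in insert_data assumes a table named t_tq already exists. The sketch below creates it with pymssql; the column types (NVARCHAR throughout, plus an identity id key) are assumptions on my part, not something the article specifies, so adjust them to your data.

import pymssql

# Hypothetical schema for the t_tq table the scraper writes to.
# Column names follow the INSERT statement above; types are assumed.
DDL = """
IF OBJECT_ID('t_tq', 'U') IS NULL
CREATE TABLE t_tq (
    id   INT IDENTITY(1,1) PRIMARY KEY,
    rq   NVARCHAR(50),   -- date
    tq   NVARCHAR(50),   -- weather description
    zgwd NVARCHAR(10),   -- highest temperature
    zdwd NVARCHAR(10)    -- lowest temperature
)
"""

db = pymssql.connect(host='localhost', user='sa', password='your_password',
                     database='your_database', port=1433)
cursor = db.cursor()
cursor.execute(DDL)
db.commit()
db.close()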
Handling potential issues

Network instability is the most common failure mode: get_page_content already retries on socket timeouts and other request errors, sleeping for a random interval between attempts so the target server is not hammered. Database errors are caught inside insert_data, which logs them and returns None rather than aborting the whole run. One further issue is worth guarding against, as sketched below: re-running the script inserts the same forecast rows again.
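A hypothetical duplicate guard inside insert_data could look like the following; the COUNT query and the rq-based uniqueness rule are my assumptions, not part of the original code.

# Skip the INSERT when a row with the same date (rq) is already present.
cursor.execute("SELECT COUNT(*) FROM t_tq WHERE rq = %s", (data['rq'],))
if cursor.fetchone()[0] == 0:
    cursor.execute(sql, (data['rq'], data['tq'], data['zgwd'], data['zdwd']))
    db.commit()
else:
    print(f"Skipping duplicate row for {data['rq']}")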
Through the steps above, you can crawl the site's data and store it in an MS SQL Server database. To double-check the pipeline end to end, read a few rows back after a run, as sketched below.
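A minimal verification sketch, assuming the same placeholder connection details and the id identity column from the schema sketch above:

import pymssql

# Read back the most recent rows to verify the inserts.
db = pymssql.connect(host='localhost', user='sa', password='your_password',
                     database='your_database', port=1433)
cursor = db.cursor()
cursor.execute("SELECT TOP 5 rq, tq, zgwd, zdwd FROM t_tq ORDER BY id DESC")
for rq, tq, zgwd, zdwd in cursor.fetchall():
    print(rq, tq, zgwd, zdwd)
db.close()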