def write_to_txt(content):
    """Append ``content`` to ``movies.txt`` as one JSON document per line.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    accumulate lines instead of overwriting earlier ones.

    Args:
        content: A JSON-serializable object. Presumably one movie record
            (a dict) produced by the page parser -- confirm against the
            caller.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles) readable
    # in the output file instead of emitting \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The ``with`` statement closes the file on exit, so no explicit close()
    # call is needed.
    with open('movies.txt', 'a', encoding='utf8') as f:
        f.write(line)
研究第1-10页
1 2 3 4 5 6 7 8
# Crawl board pages 1-10 sequentially; the ``offset`` query parameter
# advances by 10 per page (0, 10, ..., 90).
for i in range(10):
    url = 'https://maoyan.com/board/4?offset=' + str(i * 10)
    # Download the page, parse it, then store every record it contains.
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download fails; skip the page
        # rather than handing None to the parser.
        continue
    movies = parse_one_page(html)
    for item in movies:
        write_to_txt(item)
多进程保存为 txt
第一步
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when ``requests`` raises a
        ``RequestException`` (which includes timeouts).
    """
    # A browser-like user agent is sent with every request.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
第二步
1 2 3 4 5 6 7 8 9 10 11 12
def main(offset):
    """Scrape one board page and persist every movie found on it.

    Each parsed item is printed and then appended to the output file via
    ``write_to_txt``.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL
            (the caller passes multiples of 10, one per page).
    """
    # https matches the scheme used by the sequential crawl of this endpoint.
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download fails; nothing to parse.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_txt(item)
if __name__ == '__main__':
    # multiprocessing.Pool runs worker *processes* (not threads); each
    # worker handles one page offset: 0, 10, ..., 90.
    pool = Pool()
    try:
        pool.map(main, [i * 10 for i in range(10)])
    finally:
        # Always release the workers, even if a task raised.
        pool.close()
        pool.join()