Maoyan Top 100 Movies
Approach

1. Fetch the HTML source of a single page and return it
2. Parse the page source, extract the title, actor, time, score and other fields, and store them as a generator
3. Write each movie's data from the generator into a txt file
4. Work out the URL pattern for pages 1-10, build the URLs, and run steps 1-3 for each page

Steps

Fetch the HTML source of a single page and return it
```python
import requests
import re
import json
from requests.exceptions import RequestException

def get_one_page(url):
    # Send a browser-like User-Agent; the default requests UA is easily blocked
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
```
```python
print(get_one_page('http://maoyan.com/board/4'))
```
Parse the single page's HTML, extract the title, actor, time and score data, and store it as a generator
```python
def parse_one_page(html):
    # One regex walks each <dd> block and captures, in order: rank, title,
    # star list, release time, and the two halves of the score
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        + r'.*?<p.*?title="(.*?)".*?</p>.*?star">(.*?)</p>'
        + r'.*?releasetime">(.*?)</p>.*?integer">(.*?)'
        + r'<.*?fraction">(.*?)</i>', re.S)
    movies = re.findall(pattern, html)
    for item in movies:
        yield {
            '排名': item[0],
            '电影名': item[1],
            '主演': item[2].strip()[3:],    # [3:] drops the "主演：" label
            '上映时间': item[3][5:],        # [5:] drops the "上映时间：" label
            '评分': item[4] + item[5]       # integer part + fraction part
        }
```
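To check what the six capture groups pick up, the pattern can be tested against a small hand-written snippet that imitates the board's markup (the HTML below is a simplified stand-in for the real page, not a verbatim copy):

```python
sample = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <p class="name"><a title="霸王别姬">霸王别姬</a></p>
  <p class="star">主演：张国荣,张丰毅,巩俐</p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <i class="integer">9.</i><i class="fraction">5</i>
</dd>
'''

for movie in parse_one_page(sample):
    print(movie)
# {'排名': '1', '电影名': '霸王别姬', '主演': '张国荣,张丰毅,巩俐', '上映时间': '1993-01-01', '评分': '9.5'}
```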
Note that parse_one_page must use yield here, not return. A function that yields returns a generator (a special kind of iterator that can be traversed with a for loop). With return, the function would exit at the end of the first loop iteration, so only one movie's data would ever be collected.
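The difference is easy to see in isolation; the two toy functions below are made up purely for illustration:

```python
def with_return(items):
    for x in items:
        return x * 2           # exits the function on the first iteration

def with_yield(items):
    for x in items:
        yield x * 2            # hands out one value, then resumes the loop

print(with_return([1, 2, 3]))       # 2 -- only the first item
print(list(with_yield([1, 2, 3])))  # [2, 4, 6] -- every item
```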
Write the generator's data to a txt file
```python
def write_to_txt(content):
    # Append one movie per line as a JSON string; the with-block
    # closes the file automatically, so no explicit close() is needed
    with open('movies.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
```
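Why ensure_ascii=False matters: without it, json.dumps escapes every non-ASCII character, which makes the txt file unreadable. A quick comparison:

```python
import json

record = {'电影名': '霸王别姬'}
print(json.dumps(record))                      # {"\u7535\u5f71\u540d": "\u9738\u738b\u522b\u59ec"}
print(json.dumps(record, ensure_ascii=False))  # {"电影名": "霸王别姬"}
```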
Study the URL pattern of pages 1-10

Each page lists 10 movies, and page n uses offset = (n-1) * 10, so the ten URLs differ only in the offset query parameter:
```python
for i in range(0, 10):
    url = 'https://maoyan.com/board/4?offset=' + str(i * 10)
    html = get_one_page(url)
    movies = parse_one_page(html)
    for item in movies:
        write_to_txt(item)
```
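As written, the loop fires all ten requests back to back. A hedged variation (the time.sleep call and the one-second interval are my additions, not part of the original) spaces the requests out, which is gentler on the server, and also skips a page if the fetch failed:

```python
import time

for i in range(0, 10):
    url = 'https://maoyan.com/board/4?offset=' + str(i * 10)
    html = get_one_page(url)
    if html is None:          # request failed or non-200 status
        continue
    for item in parse_one_page(html):
        write_to_txt(item)
    time.sleep(1)             # pause between pages; adjust as needed
```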
Saving to txt with multiprocessing (parse_one_page and write_to_txt are the same as above):

```python
import requests
import re
import json
from multiprocessing import Pool
from requests.exceptions import RequestException

def get_one_page(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
```
```python
def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_txt(item)

if __name__ == '__main__':
    pool = Pool()  # defaults to one worker process per CPU core
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
```
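A caveat with this version: all worker processes append to movies.txt at the same time, so records from different pages can land out of rank order. A safer pattern, sketched below assuming the same get_one_page, parse_one_page and write_to_txt as above (scrape_page is a hypothetical helper name), is to let pool.map collect each page's records and do all the writing in the parent process:

```python
def scrape_page(offset):
    # Runs in a worker process: fetch and parse, but do not touch the file
    html = get_one_page('https://maoyan.com/board/4?offset=' + str(offset))
    return list(parse_one_page(html)) if html else []

if __name__ == '__main__':
    pool = Pool()
    # pool.map returns results in input order, so the ranking is preserved
    pages = pool.map(scrape_page, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    for page in pages:          # write sequentially in the parent process
        for item in page:
            write_to_txt(item)
```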
Saving to CSV, single-threaded:

```python
import requests
import re
import json
import pandas
from requests.exceptions import RequestException

def get_one_page(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    response = requests.get(url, headers=headers)
    return response.text

def parse_one_page(html):
    pageary = []
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        + r'.*?<p.*?title="(.*?)".*?</p>.*?star">(.*?)</p>'
        + r'.*?releasetime">(.*?)</p>.*?integer">(.*?)'
        + r'<.*?fraction">(.*?)</i>', re.S)
    movies = re.findall(pattern, html)
    for item in movies:
        # item[0] is the rank; it is skipped here because to_csv
        # writes a row index of its own
        movie = {
            '电影名': item[1],
            '主演': item[2].strip()[3:],
            '上映时间': item[3][5:],
            '评分': item[4] + item[5]
        }
        pageary.append(movie)
    return pageary

ary = []
for i in range(0, 10):
    url = 'https://maoyan.com/board/4?offset=' + str(i * 10)
    html = get_one_page(url)
    ary = ary + parse_one_page(html)

df = pandas.DataFrame(ary)
df.to_csv('movies.csv')
```
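If the CSV will be opened in Excel, the Chinese text may appear garbled because Excel expects a byte-order mark; a small hedged tweak (both are standard pandas to_csv parameters, not part of the original) fixes this and drops the redundant row index:

```python
df.to_csv('movies.csv', encoding='utf_8_sig', index=False)
```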
Saving to CSV with multiprocessing:

```python
import requests
import re
import json
import pandas
from multiprocessing import Pool
from requests.exceptions import RequestException

def get_one_page(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pageary = []
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        + r'.*?<p.*?title="(.*?)".*?</p>.*?star">(.*?)</p>'
        + r'.*?releasetime">(.*?)</p>.*?integer">(.*?)'
        + r'<.*?fraction">(.*?)</i>', re.S)
    movies = re.findall(pattern, html)
    for item in movies:
        pageary.append({
            '排名': item[0],
            '电影名': item[1],
            '主演': item[2].strip()[3:],
            '上映时间': item[3][5:],
            '评分': item[4] + item[5]
        })
    return pageary

def write_to_csv(ary):
    # Write the complete result set in one go; if every worker wrote its
    # own page, each call would overwrite movies.csv
    df = pandas.DataFrame(ary)
    df.to_csv('movies.csv')

def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    return parse_one_page(html) if html else []

if __name__ == '__main__':
    pool = Pool()
    pages = pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    # Flatten the per-page lists into one list of movies, then write once
    ary = [movie for page in pages for movie in page]
    write_to_csv(ary)
```
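Because pool.map hands back its results in the same order as the offsets passed in, flattening the pages and writing once keeps the CSV in rank order and avoids having ten processes compete over a single movies.csv file.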