1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
|
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import NoSuchAttributeException from selenium.common.exceptions import TimeoutException from pyquery.pyquery import PyQuery as pq import pymongo
# Shared Selenium session used by every crawl function in this module.
broswer = webdriver.Firefox()
# Explicit-wait helper bound to that session; each wait gives up after 10 s.
wait = WebDriverWait(broswer, timeout=10)

# MongoDB connection and the database the scraped reviews are written to.
client = pymongo.MongoClient(host='localhost')
db = client['MaFengWoView']
def search_first(url, max_retries=3):
    """Open the place-list page at *url* and crawl the places on its first page.

    The page count is read from the ``data-page`` attribute of the
    ``.pi.pg-last`` pager element. Loading is retried up to *max_retries*
    times when the pager does not appear before the wait times out; if it
    never appears the list is treated as having a single page.

    Args:
        url: Address of the place-list page.
        max_retries: Maximum number of load attempts (at least one is made).

    Returns:
        The number of list pages (1 when no pager was found).
    """
    pager_last = (By.CSS_SELECTOR, '.pi.pg-last')
    attempts = max(1, max_retries)
    total = 1
    for attempt in range(1, attempts + 1):
        # Only the load and the wait can time out, so only they are retried.
        # Crawling happens after the loop so a timeout deeper in the crawl
        # cannot restart the whole list from scratch.
        try:
            broswer.get(url)
            last_link = wait.until(EC.presence_of_element_located(pager_last))
        except TimeoutException:
            print('Timed out loading {} (attempt {}/{})'.format(url, attempt, attempts))
            continue
        total = int(last_link.get_attribute('data-page'))
        break
    else:
        print('No pager found on {}; treating it as a single page'.format(url))
    print(total)
    place()
    return total
def search_next(page, max_retries=3):
    """Advance the place-list pager to *page* and crawl the places shown there.

    Clicks the ``.pi.pg-next`` control, waits until ``.pg-current`` contains
    the text of *page*, then calls ``place()``.

    Args:
        page: Page number expected to be current after the click.
        max_retries: Maximum number of attempts to find and click the
            "next" control (at least one is made).

    Raises:
        TimeoutException: If the "next" control never becomes clickable, or
            the pager never shows *page* after the click.
    """
    next_control = (By.CSS_SELECTOR, '.pi.pg-next')
    current_marker = (By.CSS_SELECTOR, '.pg-current')
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            # Waiting for clickability turns a missing control into a
            # TimeoutException. The NoSuchAttributeException the old code
            # caught is not raised for a missing element, so its retry
            # branch never ran.
            wait.until(EC.element_to_be_clickable(next_control)).click()
        except TimeoutException:
            if attempt == attempts:
                raise
            print('Next-page control not clickable (attempt {}/{})'.format(attempt, attempts))
        else:
            break
    # The click is not repeated past this point: a second click would skip a page.
    wait.until(EC.text_to_be_present_in_element(current_marker, str(page)))
    place()
def place():
    """Crawl the reviews of every place linked from the current list page.

    Each place is opened in a temporary extra window so the list page, and
    the pager state that ``search_next`` relies on, stays loaded in the
    original window. For every place, ``view_first`` scrapes the first
    review page and ``view_next`` is called for pages 2..total.
    """
    anchors = broswer.find_elements(By.CSS_SELECTOR, '.scenic-list.clearfix li a')
    # Read all hrefs up front so no element handle is needed once crawling starts.
    urls = [anchor.get_attribute('href') for anchor in anchors]
    list_window = broswer.current_window_handle

    for url in urls:
        if not url:
            # Anchor without an href: nothing to visit.
            continue
        print(url)

        known_windows = set(broswer.window_handles)
        broswer.execute_script("window.open('about:blank');")
        wait.until(lambda drv: len(drv.window_handles) > len(known_windows))
        new_window = next(h for h in broswer.window_handles if h not in known_windows)
        broswer.switch_to.window(new_window)
        try:
            total = view_first(url)
            for page in range(2, total + 1):
                view_next(page)
        except TimeoutException:
            # One unresponsive place should not abort the remaining places.
            print('Timed out while crawling {}; skipping the rest of it'.format(url))
        finally:
            # Always discard the temporary window and return to the list page.
            broswer.close()
            broswer.switch_to.window(list_window)
def view_first(url, max_retries=3):
    """Open a place page at *url* and scrape its first page of reviews.

    The review page count is read from the ``data-page`` attribute of the
    ``.pi.pg-last`` pager element. Loading is retried up to *max_retries*
    times when the pager does not appear before the wait times out; if it
    never appears the place is treated as having a single review page, so a
    place without a pager no longer causes endless retries.

    Args:
        url: Address of the place page.
        max_retries: Maximum number of load attempts (at least one is made).

    Returns:
        The number of review pages (1 when no pager was found).
    """
    pager_last = (By.CSS_SELECTOR, '.pi.pg-last')
    attempts = max(1, max_retries)
    total = 1
    for attempt in range(1, attempts + 1):
        # Keep the retried section to the load and the wait; scraping runs
        # once, after the loop.
        try:
            broswer.get(url)
            last_link = wait.until(EC.presence_of_element_located(pager_last))
        except TimeoutException:
            print('Timed out loading {} (attempt {}/{})'.format(url, attempt, attempts))
            continue
        total = int(last_link.get_attribute('data-page'))
        break
    else:
        print('No pager found on {}; treating it as a single page'.format(url))
    get_view()
    return total
def view_next(page, max_retries=3):
    """Advance the review pager to *page* and scrape the reviews shown there.

    Clicks the ``.pi.pg-next`` control, waits until ``.pg-current`` contains
    the text of *page*, then calls ``get_view()``.

    Args:
        page: Page number expected to be current after the click.
        max_retries: Maximum number of attempts to find and click the
            "next" control (at least one is made).

    Raises:
        TimeoutException: If the "next" control never becomes clickable, or
            the pager never shows *page* after the click.
    """
    next_control = (By.CSS_SELECTOR, '.pi.pg-next')
    current_marker = (By.CSS_SELECTOR, '.pg-current')
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            # A missing or not-yet-rendered control surfaces as a
            # TimeoutException from the wait. The NoSuchAttributeException
            # the old code caught is not raised for a missing element.
            wait.until(EC.element_to_be_clickable(next_control)).click()
        except TimeoutException:
            if attempt == attempts:
                raise
            print('Next-page control not clickable (attempt {}/{})'.format(attempt, attempts))
        else:
            break
    # Do not click again after a successful click, or a page would be skipped.
    wait.until(EC.text_to_be_present_in_element(current_marker, str(page)))
    get_view()
def get_view():
    """Parse the page currently loaded in the browser and store each review.

    Every element matching ``.rev-item.comment-item.clearfix`` becomes one
    record with the stripped text of its ``.name``, ``.level`` and
    ``.rev-txt`` descendants, which is handed to ``save_to_mongo``.
    """
    # Record key -> selector, in the order the keys are written.
    field_selectors = (
        ('name', '.name'),
        ('level', '.level'),
        ('txt', '.rev-txt'),
    )
    document = pq(broswer.page_source)
    for review in document('.rev-item.comment-item.clearfix').items():
        record = {
            key: review.find(selector).text().strip()
            for key, selector in field_selectors
        }
        save_to_mongo(record)
def save_to_mongo(view):
    """Insert one review record into the ``MaFengWoView`` collection.

    Args:
        view: Dictionary describing a single review.

    Returns:
        True when the insert succeeded, False when PyMongo reported an error.
    """
    try:
        # insert_one signals failure by raising, not by returning a falsy
        # value, so the outcome has to be decided by exception handling.
        db['MaFengWoView'].insert_one(view)
    except pymongo.errors.PyMongoError as exc:
        print('Failed to save to MongoDB', view, exc)
        return False
    print('Saving to MongoDB',view)
    return True
def main(url='http://www.mafengwo.cn/jd/10088/gonglve.html'):
    """Crawl every page of the place list starting at *url*.

    The first page is handled by ``search_first``, which also reports the
    page count; the remaining pages are reached with ``search_next``. The
    browser session and the MongoDB client are released when the crawl
    ends, whether it finished or failed.

    Args:
        url: Address of the first place-list page.
    """
    try:
        total = search_first(url)
        for page in range(2, total + 1):
            search_next(page)
    finally:
        # Release external resources even if the crawl raised part-way.
        try:
            broswer.quit()
        finally:
            client.close()
# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__=='__main__': main()
|