Scrapy: scraping Guangzhou second-hand housing data from anjuke.com (saved to CSV)
Spiders

```python
from scrapy import Spider, Request

from anjuke.items import AnjukeItem


class AnjukehouseSpider(Spider):
    name = 'anjukeHouse'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://guangzhou.anjuke.com/sale/p1-rd1/#filtersort']

    def parse(self, response):
        # Follow every listing on the current result page.
        urls = response.xpath('//div[@class="house-title"]/a/@href').extract()
        for url in urls:
            yield Request(url, callback=self.parse_detail)
        # Follow the "next page" link until it disappears.
        next_page = response.xpath('//*[@id="content"]/div[4]/div[7]/a[7]/@href').extract()
        if next_page:
            next_page = response.urljoin(next_page[0])
            yield Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        item = AnjukeItem()
        item['date'] = response.xpath('//span[@class="house-encode"]/text()').extract()[0].split()
        item['tittle'] = response.xpath('//h3[@class="long-title"]/text()').extract()
        item['price'] = response.xpath('//span[@class="light info-tag"]/em/text()').extract_first().split()
        # The detail table is a flat list of text nodes, so fields are picked by index.
        houseInfo = response.xpath('//div[@class="houseInfo-content"]/text()').extract()
        item['huxing'] = houseInfo[2].strip().replace("\n", "").replace("\t", "").split()
        item['area'] = houseInfo[7].strip().split()
        item['built'] = houseInfo[9].strip().replace("\n", "").replace("\t", "").split()
        item['chaoxiang'] = houseInfo[10].strip().split()
        item['leixing'] = houseInfo[-8].strip().split()
        item['louceng'] = houseInfo[-7].strip().split()
        item['zhuangxiu'] = houseInfo[-6].strip().split()
        print(item)
        return item
```
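The XPath expressions above, especially the index-based lookups into `houseInfo-content` and the hard-coded next-page selector, break easily if anjuke.com changes its markup. They can be checked interactively in `scrapy shell` before running the full crawl; a minimal sketch (the selectors are copied from `parse()` above):

```python
# In a terminal: scrapy shell 'https://guangzhou.anjuke.com/sale/p1-rd1/#filtersort'
# Then, inside the shell session:
urls = response.xpath('//div[@class="house-title"]/a/@href').extract()
print(len(urls), urls[:3])        # number of listings and a few detail-page URLs
next_page = response.xpath('//*[@id="content"]/div[4]/div[7]/a[7]/@href').extract()
print(next_page)                  # an empty list means the next-page XPath no longer matches
```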
items

```python
from scrapy import Item, Field


class AnjukeItem(Item):
    tittle = Field()      # listing title
    huxing = Field()      # layout (rooms / halls)
    area = Field()        # floor area
    chaoxiang = Field()   # orientation
    louceng = Field()     # floor
    price = Field()       # total price
    zhuangxiu = Field()   # decoration
    leixing = Field()     # property type
    date = Field()        # listing code / publish date
    built = Field()       # year built
```
settings

```python
BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

ROBOTSTXT_OBEY = False

# Slow the crawl down to reduce the chance of getting blocked.
DOWNLOAD_DELAY = 3

DEFAULT_REQUEST_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

ITEM_PIPELINES = {
    'anjuke.pipelines.Pipeline_ToCSV': 300,
}
```
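As an aside, newer Scrapy versions (2.1+) can also write CSV directly through the built-in feed exports, without a custom pipeline; the Pipeline_ToCSV below is used instead because it fixes the column order and flattens the list-valued fields into plain rows. A minimal sketch of what the feed-export alternative could look like in settings.py:

```python
# settings.py (alternative, Scrapy >= 2.1): let the built-in CSV exporter write the file.
FEEDS = {
    'anjuke.csv': {
        'format': 'csv',
        'encoding': 'utf-8',
    },
}
```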
pipelines

```python
import csv


class Pipeline_ToCSV(object):

    def __init__(self):
        # One CSV file for the whole crawl; newline='' avoids blank lines on Windows.
        self.file = open('anjuke.csv', 'w', newline='', encoding='utf-8')
        self.csvwriter = csv.writer(self.file, delimiter=',')
        self.csvwriter.writerow(['date', 'tittle', 'price', 'huxing', 'area',
                                 'built', 'chaoxiang', 'leixing', 'louceng', 'zhuangxiu'])

    def process_item(self, item, spider):
        # Each field is a list (split()/extract() in the spider), so zip() turns the
        # parallel lists into CSV rows -- normally exactly one row per item.
        rows = zip(item['date'], item['tittle'], item['price'], item['huxing'],
                   item['area'], item['built'], item['chaoxiang'], item['leixing'],
                   item['louceng'], item['zhuangxiu'])
        for row in rows:
            self.csvwriter.writerow(row)
        return item

    def close_spider(self, spider):
        self.file.close()
```
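To launch the crawl, `scrapy crawl anjukeHouse` from the project root is enough. If a plain Python entry point is preferred (for an IDE run configuration, say), a minimal sketch of a run script, assuming it is saved next to scrapy.cfg (the file name run.py is my choice, not part of the project above):

```python
# run.py - start the anjukeHouse spider programmatically.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # picks up the settings.py shown above
process.crawl('anjukeHouse')
process.start()                                    # blocks until the crawl finishes
```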
Selenium

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas
import time
import random

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
houseInfo = []


def get_urls():
    # Listing pages 1-50 of the Guangzhou second-hand section.
    page_urls = []
    start_url = 'https://guangzhou.anjuke.com/sale/p'
    for i in range(1, 51):
        page_urls.append(start_url + str(i))
    return page_urls


def HouseUrl(url):
    # Open one listing page and collect the detail-page links on it.
    time.sleep(random.random() * 10)   # random pause to look less like a bot
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.curr')))
    links = browser.find_elements_by_css_selector('.houseListTitle')
    house_urls = []
    for a in links:
        house_urls.append(a.get_attribute('href'))
    return house_urls


def get_detail(url):
    # Open one detail page and parse the fields out of the rendered HTML.
    time.sleep(random.random() * 10)
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.houseInfoBox')))
    info = {}
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    info['标题'] = soup.select('.long-title')[0].text
    info['总价'] = soup.select('.basic-info span')[0].text
    info['户型'] = soup.select('.basic-info span')[1].text
    info['面积'] = soup.select('.basic-info span')[2].text
    info['单价'] = soup.select('.houseInfo-content')[2].text
    info['朝向'] = soup.select('.houseInfo-content')[7].text
    info['月供'] = soup.select('.houseInfo-content')[8].text
    info['楼层'] = soup.select('.houseInfo-content')[-7].text
    info['装修'] = soup.select('.houseInfo-content')[-6].text
    k = ['标题', '总价', '户型', '面积', '单价', '朝向', '月供', '楼层', '装修']
    info_adj = dict(zip(k, list(info.values())))
    houseInfo.append(info_adj)
    print(houseInfo)
    return houseInfo


def save_to_csv(houseInfo):
    df = pandas.DataFrame(houseInfo)
    df.to_csv('maoming.csv')


def main():
    page_urls = get_urls()
    for url in page_urls:
        house_urls = HouseUrl(url)
        for url in house_urls:
            houseInfo = get_detail(url)
            # Rewrite the CSV after every detail page so partial results survive a block.
            save_to_csv(houseInfo)


if __name__ == '__main__':
    main()
```
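The script above opens a visible Chrome window. If the crawl should run without one, Chrome can be started headless; a minimal sketch of a driver setup that could replace the `webdriver.Chrome()` call, assuming a chromedriver matching the installed Chrome is on PATH and a Selenium version that accepts the `options` keyword:

```python
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # no visible browser window
options.add_argument('--window-size=1920,1080')  # keep the desktop layout so the CSS selectors still match
browser = webdriver.Chrome(options=options)
```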