爬虫学习用xpath爬取链家二手房信息

学习xpath爬取杭州链家二手房

先导入需要的库

# -*- coding: UTF-8 -*- 
import requests
from lxml import etree
import pandas as pd
import time
import os

分析链家杭州二手房的网页信息：
共有一百页房源信息，且没有反爬虫限制，可以直接爬取。本次就爬取这一百页，使用 requests 库。

# Fetch a page while presenting a browser-like User-Agent.
def gethtml(url):
    """Download ``url`` and return the response body as text.

    The request carries a Firefox User-Agent header. The text is decoded
    with the encoding detected from the body (``apparent_encoding``).

    Returns ``None`` after printing a message when the request fails,
    times out, or the server answers with an HTTP error status, so callers
    must check the result before parsing it.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'}
    try:
        # Without a timeout a single stalled connection would block forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled here; programming errors
        # and KeyboardInterrupt are left to propagate.
        print('error', url, exc)
        return None
    r.encoding = r.apparent_encoding
    return r.text

# Extract the listing fields from a result page with XPath.
def getinfo(html):
    """Parse one result page into a dict of column name -> list of values.

    Args:
        html: page source as returned by ``gethtml``; ``None`` or an empty
            string yields a table whose columns are all empty lists.

    Returns:
        A dict (not a DataFrame) with the keys "区域" (district),
        "小区" (community), "户型" (flat type), "面积" (size),
        "装修" (decoration), "层数" (floor), "总价" (total price) and
        "单价" (price per square metre), in that order.

    The district, flat type, size and decoration are taken from fixed
    positions in the text nodes of each ``div[@class='info']``. A position
    that does not exist in a listing is recorded as ``None`` rather than
    raising ``IndexError``. The other columns come from independent
    page-wide XPath queries, so their lengths only match when every
    listing on the page has the expected layout.
    """
    table = {"区域": [], "小区": [], "户型": [], "面积": [],
             "装修": [], "层数": [], "总价": [], "单价": []}
    if not html:
        return table
    allstring = etree.HTML(html)
    if allstring is None:
        # Nothing parseable in the document.
        return table

    # Column name -> index of the text node that carries the value.
    positions = (("区域", 0), ("户型", 2), ("面积", 4), ("装修", 8))
    for house in allstring.xpath("//div[@class='info']"):
        # Evaluate the text query once per listing instead of once per field.
        texts = house.xpath('.//text()')
        for column, index in positions:
            table[column].append(texts[index] if index < len(texts) else None)

    table["小区"] = allstring.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[2]/div/a/text()')
    table["层数"] = allstring.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[3]/div/text()')
    table["总价"] = allstring.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()')
    table["单价"] = allstring.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()')
    return table

其中直接快速获得元素xpath的小技巧：
在浏览器页面中选中元素（比如房子价格），右键选择“检查”，然后在开发者工具中右键该元素，选择 Copy → Copy XPath。注意：如果想选中一系列的价格，需要对比多个房子价格的 XPath，并找出其中的规律。
比如：链家二手房中，第30个房子价格xpath:

//*[@id="content"]/div[1]/ul/li[30]/div[1]/div[6]/div[1]/span
第29个：
//*[@id="content"]/div[1]/ul/li[29]/div[1]/div[6]/div[1]/span
...
那么，price的xpath就是：
//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[1]/span
用这个方法可以返回当页价格的list

Alt text

# The crawl covers 100 result pages, so each page's table is appended to
# the rows already stored in a single CSV file.
def append_csv(table):
    """Append the rows in ``table`` to ``ershoufang.csv``.

    Args:
        table: dict of column name -> list of values, as accepted by
            ``pandas.DataFrame.from_dict``.

    If the file already holds data, the new rows are concatenated below
    the existing ones and the file is rewritten; otherwise the file is
    created. The DataFrame index is never written. Writing it would make
    every read-back gain an extra "Unnamed: 0" column, so the file would
    grow one junk column per appended page.
    """
    filename = "ershoufang.csv"
    df_new = pd.DataFrame.from_dict(table)
    # A zero-byte file cannot be parsed by read_csv, so treat it as absent.
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        if df_new.empty:
            # Nothing to add; leave the existing file untouched.
            return
        df_old = pd.read_csv(filename)
        df = pd.concat([df_old, df_new], sort=False, ignore_index=True)
    else:
        df = df_new
    df.to_csv(filename, index=False)
def main():
    """Crawl result pages 1-100 and append each page's listings to the CSV.

    A page whose download failed is skipped, so one bad request does not
    abort the rest of the crawl. The loop pauses 3 seconds after every
    page.
    """
    start_url = "https://hz.lianjia.com/ershoufang/pg{0}/"
    for i in range(1, 101):
        url = start_url.format(i)
        html = gethtml(url)
        # gethtml signals a failed download by returning None; handing
        # that to the parser would raise and stop the whole run.
        if html is not None:
            table = getinfo(html)
            append_csv(table)
        time.sleep(3)


# Only start crawling when executed as a script, not when imported.
if __name__ == "__main__":
    main()

最后的结果：
Alt text