Import modules

In [1]:
import numpy as np
import pandas as pd
import pickle

import bs4 as bs
import urllib.request
import json
from bs4 import BeautifulSoup
import requests
import re
from collections import Counter

import MySQLdb, pickle
from sqlalchemy import create_engine

Crawling data from Cars.com

In [2]:
# Create a data frame to store crawling data
df = pd.DataFrame(columns = ['year', 'title','brand','model','miles','photos','video','exterior_color','interior_color','transmission','drivetrain','star','review_no','vendor','price'])

# Getting data from cars.com
for page in range(1,51):
    # Cars.com URL
    url = 'https://www.cars.com/for-sale/searchresults.action/?page='+str(page)+'&perPage=100&rd=99999&searchSource=PAGINATION&showMore=true&sort=relevance&stkTypId=28881&zc=31216'

    # Requesting data 
    sauce = urllib.request.urlopen(url).read()
    
    # Parsing data with lxml
    soup = bs.BeautifulSoup(sauce, 'lxml')
    
    # All of the data comes with listing-row__details as the class name
    specificSoup = soup.find_all('div', class_='listing-row__details')

    
    # Put the imported data in the data frame
    for div in specificSoup:
        
        # Put information about the name of the car in the name variable
        name_index = div.find('h2', {'class' :'cui-delta listing-row__title'}).text
        name = name_index.split("\n")[1]

        # Get only the car's year out of the car information in the name variable
        year_index = re.findall('[0-9]{4}',name)[0:1]
        year = year_index[0]

        # Get only the car's title out of the car information in the name variable
        title_index = name.split(" ")[29:34]
        title = " ".join(title_index)

        # Get the car's brand out of the car information in the title variable
        brand = title.split(" ")[0]
       
        # If there hasn't model information, brand will be as a model
        try:
            model = title.split(" ")[1]
        except:
            model = brand

        # The miles of the car 
        mile_index = div.find('span', {'class' : 'listing-row__mileage'}).text
        miles = mile_index.split("\n")[0]
        regex = re.compile("\d+")
        miles = regex.findall(miles) 
        miles = ''.join(miles)

        # The vendor information
        vendor_index =div.find('div',{'class' : 'listing-row__dealer-name listing-row__dealer-name-mobile'}).text
        vendor_group = vendor_index.split(' ')[0:4]

        vendor = ''
        for i in vendor_group:
            if i == '':continue
            elif re.findall('\n', i): vendor += re.sub('\n', '', i)
            else: vendor += " "+ i +" "

        # Number of photos
        photos_index = div.find('div', {'class' : 'media-count shadowed'}).text
        photos = re.findall('[0-9]{1,3}',photos_index.split("\n")[1])[0]

        # Number of video
        video_index = div.find('div', {'class' : 'media-count shadowed'}).text
        try: 
            video = re.findall('[0-9]{1,3}',photos_index.split("\n")[2])[0]
        except:
            video = 0

        # Exterior color
        exterior_color = div.find('ul', {'class' : 'listing-row__meta'}).text
        try:
            exterior_color = re.sub('\n', ' ',exterior_color).split(" ")[4]
        except:
            exterior_color = 'black'
        
        # Interior color
        interior_color = div.find('ul', {'class' : 'listing-row__meta'})
        try:
            interior_color = list(interior_color)[3].text.split(" ")[3:5]
        except:
            interior_color = "black"
        interior_color = " ".join(interior_color)
        interior_color = re.sub('/' , ' ' ,interior_color)

        # Transmission type
        transmission = div.find('ul', {'class' : 'listing-row__meta'})
        try:
            transmission = list(transmission)[5].text.split(" ")[2]
        except:
            transmission = "6-speed"
        
        # Drivetrain type
        drivetrain = div.find('ul', {'class' : 'listing-row__meta'})
        try:
            drivetrain = list(drivetrain)[7].text.split(" ")[2:5]
        except: 
            drivetrain = 'fwd'
        drivetrain = " ".join(drivetrain).lower()
        if drivetrain == 'rear wheel drive':
            drivetrain = 'rwd'
        elif drivetrain == 'front wheel drive':
            drivetrain = 'fwd'
        elif drivetrain == 'all wheel drive':
            drivetrain = '4wd'

        # Number of star
        if div.find('div',{'class' : 'dealer-rating-stars'}) == None:
            star = 0
        else:
            star_index =div.find('div',{'class' : 'dealer-rating-stars'}).text
            star = star_index.split(" ")[36]
            regex = re.compile("\d")
            star = regex.findall(star)[0] 

        # Number of review
        if div.find('span',{'class' : 'listing-row__review-number'}) == None:
            review_no = 0
        else:
            review_index =div.find('span',{'class' : 'listing-row__review-number'}).text
            review_no = re.sub('\n', '',review_index.split(" ")[1])

        # Car price
        if div.find('span', {'class' : 'listing-row__price'}) == None:
            price = 0
        else:
            price_index = div.find('span', {'class' : 'listing-row__price'}).text
            price = price_index.split("\n")[1]
            regex = re.compile("\d")
            price = ''.join(regex.findall(price))
            price


        # Entering crawled data into a data frame
        data = { 
                'year' : year,
                'title' : title.lower(),
                'brand': brand.lower(),
                'model': model.lower(),
                'miles' : miles,
                'photos': photos,
                'video' : video,
                'exterior_color' : exterior_color.lower(),
                'interior_color' : interior_color.lower(),
                'transmission' : transmission.lower(),
                'drivetrain' : drivetrain.lower(),
                'star': star,
                'review_no' : review_no,

                'vendor' : vendor.lower(),
                'price': price,
                    }


        df.loc[len(df)] = data
# Data Preprocessing   
df['transmission'] = df['transmission'].apply(lambda x: '6-speed' if x == 'automatic' or x == '6' else x)
df['transmission'] = df['transmission'].apply(lambda x: '8-speed' if x == '8' else x)
df['transmission'] = df['transmission'].apply(lambda x: '5-speed' if x == '5' else x)   
df['transmission'] = df['transmission'].apply(lambda x: '5-speed' if x == '5' else x)   
df['transmission'] = df['transmission'].apply(lambda x: 'x-speed' if x != '1-speed' and x != '2-speed' and x != '3-speed' and \
                                              x != '4-speed' and x != '5-speed' and x != '6-speed' and x != '7-speed' and x != '7-speed' and\
                                              x != '8-speed' and x != '9-speed' and x != '10-speed' \
                                              else x)
df['drivetrain'] = df['drivetrain'].apply(lambda x: '4wd' if x == 'four wheel drive' or x == '4wd' or x=='4x4'or x=='awd'else x)
df['drivetrain'] = df['drivetrain'].apply(lambda x: 'fwd' if x == '2wd' or x=='f w d' else x)
df['drivetrain'] = df['drivetrain'].apply(lambda x: '4wd' if x != 'fwd' and  x!='rwd' and x!='4wd' else x)

df.head()
Out[2]:
year title brand model miles photos video exterior_color interior_color transmission drivetrain star review_no vendor price
0 2017 nissan gt-r premium nissan gt-r 7365 32 1 orange orange 6-speed 4wd 5 7 nexus auto brokers 89900
1 2017 nissan gt-r premium nissan gt-r 6232 32 1 black orange 6-speed 4wd 5 7 nexus auto brokers 89900
2 2017 nissan gt-r premium nissan gt-r 2589 26 1 blue orange 6-speed 4wd 5 7 nexus auto brokers 89900
3 2010 bmw 328 i bmw 328 95705 28 1 black black 6-speed rwd 5 24 burnoutgarage 10995
4 2015 bmw 435 i bmw 435 61131 32 1 alpine black 8-speed rwd 5 26 inetwork auto group 23998
In [37]:
# Exclude prices and non-branded data
df = df[df["price"] != ""]
df = df[df["brand"] != ""]
In [40]:
# Transforming data for data processing
df["year"] = df["year"].astype('int')
df["miles"] = df["miles"].astype('int')
df["photos"] = df["photos"].astype('int')
df["video"] = df["video"].astype('int')
df["star"] = df["star"].astype('int')
df["review_no"] = df["review_no"].astype('int')
df["price"] = df["price"].astype('int')

Put Crawled data into Database (MySQL) at Amazon web service cloud

In [43]:
# Load pickle file as database password
pw = pickle.load(open('./Data/pw.plk','rb'))
In [45]:
# Saving data to the database(MySQL at Amazon Web Service Cloud )
engine = create_engine("mysql+mysqldb://root:" + pw + "@13.125.22.6/used_car")
df.to_sql(name="used_car", con=engine, if_exists='replace')