In [6]:
import MySQLdb, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from collections import Counter
In [7]:
# Read car_info from MySQL hosted on AWS (Amazon Web Services)
pw = pickle.load(open('./Flask/models/pw.plk','rb'))

db = MySQLdb.connect(
    "13.125.22.6",
    "root",
    pw,
    "used_car",
    charset='utf8',
)

# SQL Query
SQL_QUERY = """
    SELECT *
    FROM used_car;
"""

# Bring car information from MySQL into dataframe
train = pd.read_sql(SQL_QUERY, db)
train.head()
Out[7]:
  | index | year | title                 | brand         | model     | miles  | photos | video | exterior_color | interior_color | transmission | drivetrain | star | review_no | vendor                 | price
0 | 0     | 2003 | mazda tribute lx v6   | mazda         | tribute   | 118205 | 21     | 1     | silver         | gray           | 4-speed      | fwd        | 5    | 43        | 1st choice autos       | 3750
1 | 1     | 2016 | subaru crosstrek 2.0i | subaru        | crosstrek | 26525  | 23     | 1     | desert         | black          | 5-speed      | 4wd        | 5    | 43        | 1st choice autos       | 17999
2 | 2     | 2018 | ford f-150 raptor     | ford          | f-150     | 2330   | 32     | 1     | race           | black          | 10-speed     | 4wd        | 5    | 244       | gilbert & baugh ford   | 53900
3 | 3     | 2015 | lexus rc f            | lexus         | rc        | 27080  | 23     | 1     | black          | red            | 8-speed      | rwd        | 5    | 6         | carmax south boulevard | 43998
4 | 4     | 2014 | mercedes-benz s 550   | mercedes-benz | s         | 121078 | 32     | 1     | palladium      | silk beige     | 7-speed      | rwd        | 5    | 939       | hendrickbmw            | 32991
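
Note that create_engine is imported above but never used; a minimal equivalent sketch of the same read through a SQLAlchemy engine (the connection URL simply reuses the host, user, password, and database from the MySQLdb.connect() call) would be:

# Alternative read via a SQLAlchemy engine, reusing the same credentials
engine = create_engine(
    "mysql+mysqldb://root:{}@13.125.22.6/used_car?charset=utf8".format(pw)
)
train = pd.read_sql(SQL_QUERY, engine)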
In [3]:
# Save the dataframe to a pickle file
pickle.dump(train, open("./Flask/models/database.plk","wb"))
In [ ]:
# Unique brands and models
brand_group = list(set(train["brand"]))
model_group = list(set(train["model"]))
In [4]:
# Top 30 car brands by listing count
brand_list = [brand for brand, count in Counter(train.brand).most_common(30)]
In [5]:
# Collect the indices of rows whose brand is not in the top 30
idx_list = [idx for idx, brand in enumerate(train["brand"]) if brand not in brand_list]
In [7]:
# Keep only rows from the top 30 car brands
train = train.drop(idx_list)
train.reset_index(drop=True, inplace=True)
train = train.drop("index", axis=1)

Feature engineering

In [8]:
# Select brand and model as categorical variables
categorical_features = ['brand', 'model']
In [9]:
# One-hot encode the categorical variables
dummy_cat = pd.get_dummies(train[categorical_features])
dummy_cat.head()
Out[9]:
brand_acura brand_audi brand_bmw brand_buick brand_cadillac brand_chevrolet brand_chrysler brand_dodge brand_ford brand_gmc ... model_xf model_xj model_xk model_xt5 model_xterra model_xts model_xv model_yaris model_yukon model_z4
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 433 columns

In [10]:
# Select year, miles, and price as numeric variables
numerical_features = ['year', 'miles', 'price']
In [11]:
# Normalize the numeric variables with a log transform (log1p)
normalize_num = np.log1p(train[numerical_features])
normalize_num.head()
Out[11]:
year miles price
0 7.602900 11.680184 8.229778
1 7.609367 10.185881 9.798127
2 7.610358 7.754053 10.894904
3 7.608871 10.206588 10.691922
4 7.608374 11.704199 10.404020
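Since np.log1p(x) computes log(1 + x), its exact inverse is np.expm1, not np.exp; a quick round-trip check:

# log1p and expm1 are exact inverses; exp() alone would be off by 1
x = 3750.0
assert np.isclose(np.expm1(np.log1p(x)), x)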
In [12]:
# Join numeric variable with categoric variable
X_train_0 = normalize_num.join(dummy_cat)

# Separate price as the y value
y_train = X_train_0["price"]
X_train = X_train_0.drop("price", axis=1)

Model: XGBoost

XGBoost is short for "Extreme Gradient Boosting", where the term "gradient boosting" was proposed in Friedman's paper Greedy Function Approximation: A Gradient Boosting Machine; XGBoost builds on that original model.

XGBoost is used for supervised learning problems, where training data with multiple features x is used to predict a target variable y.
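
Concretely, XGBoost fits an additive ensemble of $K$ regression trees $f_k$ and minimizes a regularized objective, where $l$ is a differentiable loss (squared error for this regression) and $\Omega$ penalizes the complexity of each tree:

$$\hat{y}_i = \sum_{k=1}^{K} f_k(x_i), \qquad \text{obj}(\theta) = \sum_{i=1}^{n} l(y_i, \hat{y}_i) + \sum_{k=1}^{K} \Omega(f_k)$$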

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# K-fold cross validation with 10 folds
k_fold = KFold(n_splits=10, shuffle=True, random_state=2018)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train)
In [14]:
# Create XGBoost Regressor model
ml = XGBRegressor(n_estimators=1000, learning_rate=0.05, verbose=False)
In [15]:
# Check the cross-validation score
%time score = cross_val_score(ml, X_train, y_train, cv=k_fold, n_jobs=-1, scoring="r2").mean()
print("Score = {0:.5f}".format(score))
CPU times: user 195 ms, sys: 60.3 ms, total: 256 ms
Wall time: 2min 18s
Score = 0.83457
In [16]:
# Train model
ml = ml.fit(X_train1, y_train1)

# Predict test data set
y_pred = ml.predict(X_test1)

# Draw the predicted vs. actual regression plot
plt.figure(figsize=(10, 5))
plt.scatter(y_test1, y_pred, s=20)
plt.title('Predicted vs. Actual')
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')

plt.plot([min(y_test1), max(y_test1)], [min(y_test1), max(y_test1)])
plt.tight_layout()
In [17]:
# Save the model to the pickle file
pickle.dump(ml, open("./Flask/models/model.plk","wb"))
In [18]:
# Set up a dataset using "brand", "model", "year", "miles", "price"
actual_car_info = train[["brand", "model","year","miles","price"]]

# Save the dataset to the pickle file
pickle.dump(actual_car_info, open("./Flask/models/actual_car_info.plk","wb"))
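
These pickle files are presumably what the Flask app reloads at serving time; a minimal sketch of that side, assuming it mirrors the paths used here (the loading code itself is hypothetical, not from this notebook):

# Hypothetical Flask-side loading; paths match the dumps above
ml = pickle.load(open("./Flask/models/model.plk", "rb"))
actual_car_info = pickle.load(open("./Flask/models/actual_car_info.plk", "rb"))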

Enter Used Car information

In [19]:
# Enter the information of the used car whose price you want to estimate
brand = input("brand: ")
model = input("model: ")
year = int(input("year: "))
miles = int(input("miles: "))
brand: kia
model: optima
year: 2015
miles: 30000
In [20]:
# Create an empty data frame with the variable columns used to train the model
target = pd.DataFrame(columns=X_train1.columns)
In [21]:
# Save the variable column to the pickle file
pickle.dump(X_train1.columns, open("./Flask/models/column.plk","wb"))
In [22]:
# Check the index location of the selected used car brand in the variable column data frame
brand_index = 0
for col in X_train.columns:
    if col == "brand_" + brand:
        break
    brand_index += 1
brand_index
Out[22]:
17
In [23]:
# Check the index location of the selected used car model in the variable column data frame
model_index = 0
for col in X_train.columns:
    if col == "model_" + model:
        break
    model_index += 1
model_index
Out[23]:
302
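pandas can look up a column's position directly, so both loops reduce to one call each; an equivalent sketch:

# Equivalent direct lookups via Index.get_loc
brand_index = X_train.columns.get_loc("brand_" + brand)   # -> 17
model_index = X_train.columns.get_loc("model_" + model)   # -> 302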
In [24]:
# Array of zeros
target_list = np.zeros_like(X_train.loc[0])
In [25]:
# Save the target_list to pickle file
pickle.dump(target_list, open("./Flask/models/target_list.plk","wb"))
In [26]:
# Put the number 1 in the selected brand and model locations in the data frame
target_list[brand_index] = 1
target_list[model_index] = 1

# Put the year and miles in the data frame
target_list[0] = year
target_list[1] = miles
In [27]:
# Insert the data as the single row of the target data frame
target.loc[0] = target_list
In [28]:
target
Out[28]:
year miles brand_acura brand_audi brand_bmw brand_buick brand_cadillac brand_chevrolet brand_chrysler brand_dodge ... model_xf model_xj model_xk model_xt5 model_xterra model_xts model_xv model_yaris model_yukon model_z4
0 2015.0 30000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1 rows × 435 columns

In [29]:
# Normalize the numerical features with the same log transform used in training
numerical_features = ['year', 'miles']
target[numerical_features] = np.log1p(target[numerical_features])
In [30]:
target
Out[30]:
year miles brand_acura brand_audi brand_bmw brand_buick brand_cadillac brand_chevrolet brand_chrysler brand_dodge ... model_xf model_xj model_xk model_xt5 model_xterra model_xts model_xv model_yaris model_yukon model_z4
0 7.608871 10.308986 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1 rows × 435 columns
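
For reuse in the Flask app, the whole target-row construction above could be bundled into one helper; a sketch under the assumption that columns holds the saved training columns (make_target is a hypothetical name, not from this notebook):

# Hypothetical helper bundling the steps above
def make_target(brand, model, year, miles, columns):
    row = pd.Series(0.0, index=columns)
    row["brand_" + brand] = 1.0
    row["model_" + model] = 1.0
    row["year"] = np.log1p(year)
    row["miles"] = np.log1p(miles)
    return row.to_frame().T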

Predict Used_Car Values

In [31]:
# Predicted logged price
price_log = ml.predict(target)
In [32]:
price_log
Out[32]:
array([ 9.68053722], dtype=float32)
In [33]:
# Revert the logged price back to its original scale (expm1 is the inverse of log1p)
price = np.expm1(price_log)
print("Price:", int(price))
Price: 16002

Compare the same model across different years

In [34]:
# Find the average price of the same model for each year
same_model = actual_car_info[actual_car_info["model"]==model] 
year_price = same_model[["year", "price"]]
year_price_list = year_price.groupby("year").agg({'price':np.mean}).astype('int')
year_price_list = year_price_list.reset_index()
year_price_list
Out[34]:
year price
0 2011 9953
1 2012 9856
2 2013 11285
3 2014 12945
4 2015 14393
5 2016 14925
6 2017 18514
In [35]:
# Convert year from numeric to string
year_price_list["year"] = year_price_list["year"].astype(str)
In [36]:
# Convert price from numeric to string
year_price_list["price"] = year_price_list["price"].astype(str)
In [37]:
list(year_price_list["price"])
Out[37]:
['9953', '9856', '11285', '12945', '14393', '14925', '18514']
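These string lists look ready to feed a front-end chart; purely as an illustration, the Flask view could pair them up like this:

# Illustrative pairing of year labels with average prices
chart_data = list(zip(year_price_list["year"], year_price_list["price"]))
# [('2011', '9953'), ('2012', '9856'), ...]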