In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
# for building model
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
In [2]:
#load csv files
df_idtravel_seodata = pd.read_csv('/home/csv/idtravel-cleaned.csv',header=0,encoding = 'ISO-8859-1')
In [3]:
# Clean the NaN data, fill with 0
df_idtravel_seodata.fillna(0)
Out[3]:
| URL | Status | Previous traffic | Current traffic | Traffic change | Previous traffic value | Current traffic value | Traffic value change | Previous # of keywords | Current # of keywords | Keywords change | Previous top keyword | Current top keyword | Previous top keyword: Volume | Current top keyword: Volume | Previous top keyword: Position | Current top keyword: Position | Top keyword: Position change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://www.indonesia.travel/id/id/destinasi/b... | 0 | 53124.0 | 54287.0 | 1163.0 | 2873.04 | 2922.98 | 49.93 | 1242.0 | 1263.0 | 21.0 | labuan bajo | labuan bajo | 200000.0 | 200000.0 | 1.0 | 1.0 | 0.0 |
| 1 | https://www.indonesia.travel/id/id/home | 0 | 27600.0 | 27073.0 | -527.0 | 1532.06 | 1432.34 | -99.72 | 1816.0 | 1816.0 | 0.0 | indonesia | indonesia | 768000.0 | 768000.0 | 6.0 | 6.0 | 0.0 |
| 2 | https://www.indonesia.travel/id/id/ide-liburan... | 0 | 20141.0 | 21079.0 | 938.0 | 373.37 | 382.80 | 9.43 | 14576.0 | 14567.0 | -9.0 | tari tradisional | tari tradisional | 28000.0 | 28000.0 | 4.0 | 4.0 | 0.0 |
| 3 | https://www.indonesia.travel/id/id/ide-liburan... | 0 | 20517.0 | 20254.0 | -263.0 | 2449.32 | 2429.70 | -19.62 | 3929.0 | 4015.0 | 86.0 | makanan tradisional | makanan tradisional | 55000.0 | 55000.0 | 1.0 | 1.0 | 0.0 |
| 4 | https://www.indonesia.travel/id/id/ide-liburan... | 0 | 11585.0 | 11590.0 | 5.0 | 1916.74 | 1916.79 | 0.05 | 8722.0 | 8775.0 | 53.0 | makanan khas sumatera | makanan khas sumatera | 3400.0 | 3400.0 | 1.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5713 | https://www.indonesia.travel/th/en/trip-ideas/... | 0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 1.0 | 1.0 | 0.0 | kampoeng nelayan tanjung lesung | kampoeng nelayan tanjung lesung | 150.0 | 150.0 | 38.0 | 38.0 | 0.0 |
| 5714 | https://www.indonesia.travel/nl/en/destination... | New | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 | 1.0 | 1.0 | 0 | jakarta to bogor | 0.0 | 150.0 | 0.0 | 98.0 | 0.0 |
| 5715 | https://www.indonesia.travel/in/en/trip-ideas/... | 0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 1.0 | 1.0 | 0.0 | hallall food | hallall food | 0.0 | 0.0 | 22.0 | 22.0 | 0.0 |
| 5716 | https://www.indonesia.travel/gb/en/news/indone... | New | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 | 1.0 | 1.0 | 0 | tour singapore 2016 | 0.0 | 150.0 | 0.0 | 28.0 | 0.0 |
| 5717 | https://www.indonesia.travel/nl/nl/destination... | 0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 1.0 | 1.0 | 0.0 | halmahera island | halmahera island | 100.0 | 100.0 | 24.0 | 24.0 | 0.0 |
5718 rows × 18 columns
In [4]:
print(df_idtravel_seodata.isna().any())
URL False Status True Previous traffic True Current traffic True Traffic change True Previous traffic value True Current traffic value True Traffic value change True Previous # of keywords True Current # of keywords True Keywords change True Previous top keyword True Current top keyword True Previous top keyword: Volume True Current top keyword: Volume True Previous top keyword: Position True Current top keyword: Position True Top keyword: Position change True dtype: bool
In [5]:
df_idtravel_seodata.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5718 entries, 0 to 5717 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 URL 5718 non-null object 1 Status 138 non-null object 2 Previous traffic 5667 non-null float64 3 Current traffic 5631 non-null float64 4 Traffic change 528 non-null float64 5 Previous traffic value 4247 non-null float64 6 Current traffic value 4229 non-null float64 7 Traffic value change 916 non-null float64 8 Previous # of keywords 5667 non-null float64 9 Current # of keywords 5631 non-null float64 10 Keywords change 1012 non-null float64 11 Previous top keyword 5667 non-null object 12 Current top keyword 5631 non-null object 13 Previous top keyword: Volume 5667 non-null float64 14 Current top keyword: Volume 5631 non-null float64 15 Previous top keyword: Position 5667 non-null float64 16 Current top keyword: Position 5631 non-null float64 17 Top keyword: Position change 5422 non-null float64 dtypes: float64(14), object(4) memory usage: 804.2+ KB
In [6]:
# Count total keywords rank 1-50
ranking_queries = df_idtravel_seodata.pivot_table(index=['Current top keyword: Position'], values=['Current top keyword'], aggfunc=['count'])
ranking_queries.sort_values(by=['Current top keyword: Position']).head(10)
Out[6]:
| count | |
|---|---|
| Current top keyword | |
| Current top keyword: Position | |
| 1.0 | 477 |
| 2.0 | 235 |
| 3.0 | 267 |
| 4.0 | 271 |
| 5.0 | 225 |
| 6.0 | 196 |
| 7.0 | 214 |
| 8.0 | 164 |
| 9.0 | 178 |
| 10.0 | 142 |
In [7]:
# Plot ranking distribution 1-10
ax = ranking_queries.head(10).plot(kind='bar')
ax.set_ylabel('Current top keyword')
ax.set_xlabel('Current top keyword: Position')
ax.set_title('Ranking distribution')
ax.grid('on')
ax.get_legend().remove()
In [8]:
df_idtravel_seodata.describe()
Out[8]:
| Previous traffic | Current traffic | Traffic change | Previous traffic value | Current traffic value | Traffic value change | Previous # of keywords | Current # of keywords | Keywords change | Previous top keyword: Volume | Current top keyword: Volume | Previous top keyword: Position | Current top keyword: Position | Top keyword: Position change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5667.000000 | 5631.000000 | 528.000000 | 4247.000000 | 4229.000000 | 916.000000 | 5667.000000 | 5631.000000 | 1012.000000 | 5667.000000 | 5631.000000 | 5667.000000 | 5631.000000 | 5422.000000 |
| mean | 88.166402 | 88.264429 | -4.965909 | 8.514344 | 8.384372 | -0.767533 | 24.859891 | 24.991298 | -0.153162 | 1384.448562 | 1331.164980 | 22.972649 | 22.813177 | 0.019181 |
| std | 975.586480 | 988.743635 | 134.317414 | 79.446409 | 78.552313 | 11.350008 | 278.186846 | 279.692821 | 4.285930 | 13099.208190 | 12926.297471 | 23.041893 | 23.003635 | 0.777853 |
| min | 0.000000 | 0.000000 | -1742.000000 | 0.000000 | 0.000000 | -189.620000 | 1.000000 | 1.000000 | -15.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | -9.000000 |
| 25% | 0.000000 | 0.000000 | -4.000000 | 0.000000 | 0.000000 | -0.010000 | 1.000000 | 1.000000 | -1.000000 | 50.000000 | 50.000000 | 5.000000 | 5.000000 | 0.000000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 0.020000 | 0.020000 | 0.000000 | 3.000000 | 3.000000 | -1.000000 | 100.000000 | 100.000000 | 14.000000 | 14.000000 | 0.000000 |
| 75% | 12.000000 | 12.000000 | 4.000000 | 0.900000 | 0.910000 | 0.000000 | 10.000000 | 10.000000 | 1.000000 | 250.000000 | 250.000000 | 35.000000 | 34.000000 | 0.000000 |
| max | 53124.000000 | 54287.000000 | 1163.000000 | 2873.040000 | 2922.980000 | 60.560000 | 14576.000000 | 14567.000000 | 86.000000 | 768000.000000 | 768000.000000 | 102.000000 | 102.000000 | 35.000000 |
In [ ]: