In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns 
import matplotlib.pyplot as plt
# for building model 
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
In [2]:
#load csv files
df_idtravel_seodata = pd.read_csv('/home/csv/idtravel-cleaned.csv',header=0,encoding = 'ISO-8859-1')
In [3]:
# Clean the NaN data, fill with 0
df_idtravel_seodata.fillna(0)
Out[3]:
URL Status Previous traffic Current traffic Traffic change Previous traffic value Current traffic value Traffic value change Previous # of keywords Current # of keywords Keywords change Previous top keyword Current top keyword Previous top keyword: Volume Current top keyword: Volume Previous top keyword: Position Current top keyword: Position Top keyword: Position change
0 https://www.indonesia.travel/id/id/destinasi/b... 0 53124.0 54287.0 1163.0 2873.04 2922.98 49.93 1242.0 1263.0 21.0 labuan bajo labuan bajo 200000.0 200000.0 1.0 1.0 0.0
1 https://www.indonesia.travel/id/id/home 0 27600.0 27073.0 -527.0 1532.06 1432.34 -99.72 1816.0 1816.0 0.0 indonesia indonesia 768000.0 768000.0 6.0 6.0 0.0
2 https://www.indonesia.travel/id/id/ide-liburan... 0 20141.0 21079.0 938.0 373.37 382.80 9.43 14576.0 14567.0 -9.0 tari tradisional tari tradisional 28000.0 28000.0 4.0 4.0 0.0
3 https://www.indonesia.travel/id/id/ide-liburan... 0 20517.0 20254.0 -263.0 2449.32 2429.70 -19.62 3929.0 4015.0 86.0 makanan tradisional makanan tradisional 55000.0 55000.0 1.0 1.0 0.0
4 https://www.indonesia.travel/id/id/ide-liburan... 0 11585.0 11590.0 5.0 1916.74 1916.79 0.05 8722.0 8775.0 53.0 makanan khas sumatera makanan khas sumatera 3400.0 3400.0 1.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5713 https://www.indonesia.travel/th/en/trip-ideas/... 0 0.0 0.0 0.0 0.00 0.00 0.00 1.0 1.0 0.0 kampoeng nelayan tanjung lesung kampoeng nelayan tanjung lesung 150.0 150.0 38.0 38.0 0.0
5714 https://www.indonesia.travel/nl/en/destination... New 0.0 0.0 0.0 0.00 0.00 0.00 0.0 1.0 1.0 0 jakarta to bogor 0.0 150.0 0.0 98.0 0.0
5715 https://www.indonesia.travel/in/en/trip-ideas/... 0 0.0 0.0 0.0 0.00 0.00 0.00 1.0 1.0 0.0 hallall food hallall food 0.0 0.0 22.0 22.0 0.0
5716 https://www.indonesia.travel/gb/en/news/indone... New 0.0 0.0 0.0 0.00 0.00 0.00 0.0 1.0 1.0 0 tour singapore 2016 0.0 150.0 0.0 28.0 0.0
5717 https://www.indonesia.travel/nl/nl/destination... 0 0.0 0.0 0.0 0.00 0.00 0.00 1.0 1.0 0.0 halmahera island halmahera island 100.0 100.0 24.0 24.0 0.0

5718 rows × 18 columns

In [4]:
print(df_idtravel_seodata.isna().any())
URL                               False
Status                             True
Previous traffic                   True
Current traffic                    True
Traffic change                     True
Previous traffic value             True
Current traffic value              True
Traffic value change               True
Previous # of keywords             True
Current # of keywords              True
Keywords change                    True
Previous top keyword               True
Current top keyword                True
Previous top keyword: Volume       True
Current top keyword: Volume        True
Previous top keyword: Position     True
Current top keyword: Position      True
Top keyword: Position change       True
dtype: bool
In [5]:
df_idtravel_seodata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5718 entries, 0 to 5717
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   URL                             5718 non-null   object 
 1   Status                          138 non-null    object 
 2   Previous traffic                5667 non-null   float64
 3   Current traffic                 5631 non-null   float64
 4   Traffic change                  528 non-null    float64
 5   Previous traffic value          4247 non-null   float64
 6   Current traffic value           4229 non-null   float64
 7   Traffic value change            916 non-null    float64
 8   Previous # of keywords          5667 non-null   float64
 9   Current # of keywords           5631 non-null   float64
 10  Keywords change                 1012 non-null   float64
 11  Previous top keyword            5667 non-null   object 
 12  Current top keyword             5631 non-null   object 
 13  Previous top keyword: Volume    5667 non-null   float64
 14  Current top keyword: Volume     5631 non-null   float64
 15  Previous top keyword: Position  5667 non-null   float64
 16  Current top keyword: Position   5631 non-null   float64
 17  Top keyword: Position change    5422 non-null   float64
dtypes: float64(14), object(4)
memory usage: 804.2+ KB
In [6]:
# Count total keywords rank 1-50
ranking_queries = df_idtravel_seodata.pivot_table(index=['Current top keyword: Position'], values=['Current top keyword'], aggfunc=['count'])
ranking_queries.sort_values(by=['Current top keyword: Position']).head(10)
Out[6]:
count
Current top keyword
Current top keyword: Position
1.0 477
2.0 235
3.0 267
4.0 271
5.0 225
6.0 196
7.0 214
8.0 164
9.0 178
10.0 142
In [7]:
# Plot ranking distribution 1-10

ax = ranking_queries.head(10).plot(kind='bar')
ax.set_ylabel('Current top keyword')
ax.set_xlabel('Current top keyword: Position')
ax.set_title('Ranking distribution')
ax.grid('on')
ax.get_legend().remove()
No description has been provided for this image
In [8]:
df_idtravel_seodata.describe()
Out[8]:
Previous traffic Current traffic Traffic change Previous traffic value Current traffic value Traffic value change Previous # of keywords Current # of keywords Keywords change Previous top keyword: Volume Current top keyword: Volume Previous top keyword: Position Current top keyword: Position Top keyword: Position change
count 5667.000000 5631.000000 528.000000 4247.000000 4229.000000 916.000000 5667.000000 5631.000000 1012.000000 5667.000000 5631.000000 5667.000000 5631.000000 5422.000000
mean 88.166402 88.264429 -4.965909 8.514344 8.384372 -0.767533 24.859891 24.991298 -0.153162 1384.448562 1331.164980 22.972649 22.813177 0.019181
std 975.586480 988.743635 134.317414 79.446409 78.552313 11.350008 278.186846 279.692821 4.285930 13099.208190 12926.297471 23.041893 23.003635 0.777853
min 0.000000 0.000000 -1742.000000 0.000000 0.000000 -189.620000 1.000000 1.000000 -15.000000 0.000000 0.000000 1.000000 1.000000 -9.000000
25% 0.000000 0.000000 -4.000000 0.000000 0.000000 -0.010000 1.000000 1.000000 -1.000000 50.000000 50.000000 5.000000 5.000000 0.000000
50% 1.000000 1.000000 1.000000 0.020000 0.020000 0.000000 3.000000 3.000000 -1.000000 100.000000 100.000000 14.000000 14.000000 0.000000
75% 12.000000 12.000000 4.000000 0.900000 0.910000 0.000000 10.000000 10.000000 1.000000 250.000000 250.000000 35.000000 34.000000 0.000000
max 53124.000000 54287.000000 1163.000000 2873.040000 2922.980000 60.560000 14576.000000 14567.000000 86.000000 768000.000000 768000.000000 102.000000 102.000000 35.000000
In [ ]: