Quick Summary
5,000 data points with pseudo-geological information (including proven gold reserves).
50 (or more) new sites up for auction with limited data.
Each team starts with $50,000,000.00 budget to bid with.
Blind, sealed-bid auctions for the rights to mine each parcel of land (min bid $100,000.00).
Auctions happen in order by parcel_id.
Extraction costs are non-trivial.
Winning team has most cash at the end.
The link for the competition
In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the three raw CSV tables (costs, elevation, samples) from the local
# elDorado dataset folder, in that order.
costs_data, elevation_data, sample_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\piush\Desktop\Dataset\elDorado\costs_data.csv",
        r"C:\Users\piush\Desktop\Dataset\elDorado\elevation_data.csv",
        r"C:\Users\piush\Desktop\Dataset\elDorado\sample_data.csv",
    )
)
In [3]:
# Inner-join the sample and elevation tables on parcel_id; column names that
# occur in both tables are disambiguated with the _left/_right suffixes.
train = sample_data.merge(
    elevation_data,
    how='inner',
    on='parcel_id',
    suffixes=('_left', '_right'),
)
# Keep the sample_data coordinates under their plain names and discard the
# duplicated coordinates that came from elevation_data.
train = train.rename(columns={'Easting_left': 'Easting', 'Northing_left': 'Northing'})
train = train.drop(columns=['Easting_right', 'Northing_right'])
In [4]:
def get_combined_data(
    train_path=r"C:\Users\piush\Desktop\Dataset\elDorado\train_data.csv",
    test_path=r"C:\Users\piush\Desktop\Dataset\elDorado\auction_parcels.csv",
):
    """Join the train and test dataframes into a single feature table.

    Args:
        train_path: CSV file with the training rows. It must contain a
            ``gold_available`` column, which is removed before joining.
        test_path: CSV file with the auction-parcel (test) rows.

    Returns:
        A DataFrame holding the training rows followed by the test rows,
        without ``gold_available`` and with a fresh 0..n-1 index.

    Raises:
        KeyError: if the training file has no ``gold_available`` column.
    """
    # reading train data
    train = pd.read_csv(train_path)
    # reading test data
    test = pd.read_csv(test_path)
    # removing the target from the training data; the keyword form is used
    # because the positional ``axis`` argument was removed in pandas 2.0
    features = train.drop(columns='gold_available')
    # merging train data and test data for future feature engineering;
    # pd.concat replaces DataFrame.append (removed in pandas 2.0) and
    # ignore_index=True renumbers the rows just like reset_index did
    combined = pd.concat([features, test], ignore_index=True)
    return combined
# Build the joined train + auction-parcel table that the following cells operate on.
combined = get_combined_data()
Same as the total in sample_data and auction_parcels
In [7]:
# Remove the 'Unnamed: 0' column from the combined table, modifying it in place.
combined.drop(columns=['Unnamed: 0'], inplace=True)
In [10]:
combined.tail(2)
Out[10]:
In [9]:
combined.shape
Out[9]:
Fill in the NaN values with the column mean (the mean and median are the same).
In [11]:
# Columns whose NaN entries are replaced by that column's mean.
columns_to_impute = [
    "Gneiss",
    "Hedbergite",
    "Isogen",
    "Mexallon",
    "Nocxium",
    "Plagioclase",
    "Pyerite",
    "Spudumain",
    "Tritanium",
    "Veldspar",
    "Megacyte",
]
# Assign the filled columns back through the frame instead of calling
# fillna(inplace=True) on a selected column: that chained form operates on a
# temporary object, triggers a FutureWarning on pandas 2.x and leaves
# `combined` unchanged once Copy-on-Write is active.
# Passing the Series of means fills every column with its own mean.
combined[columns_to_impute] = combined[columns_to_impute].fillna(
    combined[columns_to_impute].mean()
)
In [12]:
# Print a structural overview of `combined`: size, column names, dtypes,
# info summary, shape and descriptive statistics.
# NOTE: the labels say "Training data", but the frame described is `combined`.
print('Reading Training data')
print('\nSize of Training data: ' + str(combined.shape))
print('Columns:' + str(combined.columns.values))
print('dtypes')
print('\n')
print(combined.dtypes)
print('\n')
print('Info: ')
print('\n')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; printing `combined.info` without parentheses only
# showed the bound-method representation.
combined.info()
print('Shape: ')
print('\n')
print(combined.shape)
print('\n')
print('numerical columns statistcs')
print('\n')
print(combined.describe())
import re
# Review input features of `combined` - Part 2A
# For every column: report the number of unique values, record columns that
# contain missing entries, and record columns holding non-numeric values.


def _has_non_numeric_values(values):
    """Return True if any non-missing entry of *values* is not parseable as a number.

    Every entry is inspected (including the first one), missing entries are
    skipped rather than ending the scan, and negative or scientific-notation
    numbers count as numeric.
    """
    present = pd.Series(values).dropna()
    # errors="coerce" turns anything that is not a number into NaN, so any
    # NaN left after dropping the genuinely missing entries marks a
    # non-numeric value.
    coerced = pd.to_numeric(present, errors="coerce")
    return bool(coerced.isnull().any())


missing_values = []
nonumeric_values = []
print("========================\n")
for column in combined:
    # Find all the unique feature values
    uniq = combined[column].unique()
    print("'{}' has {} unique values".format(column, uniq.size))
    # Find features with missing values
    missing_count = combined[column].isnull().sum()
    if missing_count:
        missing_values.append("{} has {} missing".format(column, missing_count))
    # Find features with non-numeric values
    if _has_non_numeric_values(uniq):
        nonumeric_values.append(column)
print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print("Features with missing values:\n{}\n\n".format(missing_values))
print("Features with non-numeric values:\n{}".format(nonumeric_values))
print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")