Predict fuel flow rate of airplanes

  • There are around 220 different parameters
  • The dataset is 12 Gb and 1000 flights
  • Most of the flights are domestic US flights
In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm
import pandas as pd 
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')

fig, ax = plt.subplots(figsize=(16,12))
# coordinates are taken from http://boundingbox.klokantech.com/

coorlist = [-133.15,24.29,-61.52,54.37]
rout_color = (204/255.0, 0, 153/255.0, 0.7)#"orange"
bg_color = (0.0, 0.0, 0, 0.0)
coast_color = (0.0, 0.0, 0, 1.0)
#coast_color = (204/255.0, 0, 153/255.0, 0.7)
fl_color = (204/255.0, 0, 153/255.0, 0.7)
m = Basemap(resolution='l', # c, l, i, h, f or None
            projection='merc',
            llcrnrlon=coorlist[0], llcrnrlat= coorlist[1], urcrnrlon=coorlist[2], urcrnrlat=coorlist[3])
m.drawcountries()
m.drawcoastlines(color=coast_color, linewidth=1.0)
m.fillcontinents(color=bg_color)
m.drawmapboundary(fill_color=bg_color)
m.shadedrelief()
path = pd.read_csv('ALL_data.csv', index_col=False)
resmap = path.groupby(['Flight_instance_ID'])

for ni, pos in enumerate(resmap):
    bla = pos[1]
    x1 = list(bla.LONP)
    y1 = list(bla.LATP)
    x, y = m(x1,y1)
    # in case of out of box, gives an error
    try:
        m.drawgreatcircle(x1[0], y1[0], x1[-1], y1[-1], linewidth=0.75, color="red", alpha = 0.2)
        m.plot(x, y, 'o', markersize='6', color='#444444', alpha=0.1)
    except:
        pass
plt.show()
In [3]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import pandas as pd
from datetime import datetime
from babel.dates import format_timedelta


#list_ = []
#z = zipfile.ZipFile('CAX_Train_1.zip')
#allFiles = z.namelist()
#df_from_each_file = (pd.read_csv(z.open(f), index_col=False) for f in allFiles[0:1])
#frame   = pd.concat(df_from_each_file, ignore_index=True)
list_f_ids = ['676200403231928',
 '676200403201250',
 '676200306100458',
 '676200408241718',
 '676200404101643']

st = str(list_f_ids[2])
moddir = 'models_rand_allf/ind_2000run_nocoor/'#'model_2/'
frame = pd.read_csv('some_flights/'+st+'.csv')
#'test_676200312080912.csv'
def to_dattim(y, m, d, h, mi, s):
    return pd.Timestamp(int(y), int(m), int(d), int(h), int(mi), int(s))

a = frame.columns

frame["datetime"] =  frame.apply(lambda row: to_dattim(row['Year'], row['Month'], row['Day'], row['Hour'], row['Minute'], row['Second']), axis=1)

temp = frame.groupby(['Flight_instance_ID']).datetime.first()

def minus_first(x,y,z):
    return y - temp[x]

frame.datetime = frame.apply(lambda row: minus_first(row['Flight_instance_ID'], row['datetime'],temp), axis=1)
In [ ]:
import dill
Mod_num = str(5)
modelpath = "models_best/model"+Mod_num+"_fix/model_new_all_phase_"+Mod_num+".pkd"
with open(modelpath,'r') as f:
    modelx = dill.load(f)

feat_imp = pd.Series(modelx.get_fscore()).sort_values(ascending=False)
feat_imp[:10].plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
print modelx.best_score

7 different phases of flight in the dataset

  • Most fuel is spent during Climb and Cruise
In [5]:
import dill
def plot_pred(Mod_num, ff_id, axi, start,  shuf = False, pred = 1):
    lc = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    zipnam = 'models_best/model'+Mod_num+'/phase'+Mod_num+'_test_separated.zip'
    z = zipfile.ZipFile(zipnam)
    finzip = 'phase'+Mod_num+'_test_separated/phase'+Mod_num+'_'+ff_id+'.csv'
    df = pd.read_csv(z.open(finzip), index_col=False)
    modelpath = "models_best/model"+Mod_num+"_fix/model_new_all_phase_"+Mod_num+".pkd"
    msi = 6
    if shuf:
        df = df.sample(frac = 1)
        msi = 3
        
    ln = len(df.FF)
    y = df.pop('FF')
    
    ##strd = pd.Timestamp('2017-1-1 00:00:00')+ pd.Timedelta(start, unit='s')
    ##endd = strd + pd.Timedelta(ln-1, unit='s')
    ##x = pd.date_range(start=strd, end=endd, freq = 's')   
    x = range(start,ln +start)
    if Mod_num =='4':
        axi.plot(x, y,'o',ms=msi,  label = 'Climb')
    elif Mod_num =='5':
        axi.plot(x, y,'o',ms=msi,  label = 'Cruise')
    if pred:     
        df = df.drop(['X','Y','DWPT','Unnamed: 0','Unnamed: 0.1', 'Flight_instance_ID'], axis = 1)
        xtest = xgb.DMatrix(df, label = y)
        with open(modelpath,'r') as f:
            modelx = dill.load(f)
        pred = modelx.predict(xtest)
        
        if Mod_num =='4':
            axi.plot(x, pred, color = 'red', lw =0.5, label = 'pred. Climb')
        elif Mod_num =='5':
            axi.plot(x, pred, color = 'black', lw =0.5, label = 'pred. Cruise')

    return ln
In [6]:
### plot all phases
import warnings
import sklearn.metrics
import xgboost as xgb
from ipywidgets import StaticInteract, RangeWidget, RadioWidget, DropDownWidget
warnings.filterwarnings('ignore')
lph = ['Unknown','Preflight','Taxi','Takeoff', 'Climb','Cruise','Approach','Rollout']
dat = pd.Timestamp('2017-1-1 00:00:00')
datmax = dat+ frame.datetime.iloc[-1]


def plot_phase(ax,xmi = dat, xma = datmax):
    lc = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    for key, grp in frame.groupby(['Flight_instance_ID','PH']):
        ph = grp['PH'].head(1).reset_index(drop=True)[0]
        grp['datetime']=grp['datetime'].apply(lambda x: x+dat)
        #print grp.datetime.iloc[0], dat, type(grp.datetime.iloc[0]), type(dat)
        if ph<7 and ph!=4 and ph!=5:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph],color = lc[ph-1])
        elif ph ==4 or ph == 5:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph])
        else:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph],color='black')
        if ph == 4:
            # take series for 4th phase to plot
            bla = grp['datetime']
        elif ph ==5:
            bla5 = grp['datetime']
    fig.canvas.draw()
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels = [i[3:] for i in labels]
    labels = labels[1:]
    #labels[1] = 'Testing'
    ax.set_xticklabels(labels, fontsize = 18)
    labels = [item.get_text() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels, fontsize = 18)

    plt.xticks(rotation=45)
    ax.set_xlim(xmin = xmi, xmax = xma)
    plt.ylabel('Fuel Flow Rate',fontsize = 20)
    plt.xlabel('Time of Flight, [min]',fontsize = 20)
In [7]:
#matplotlib.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(15, 9))
plot_phase(ax)
plt.legend(fontsize = 20)
plt.savefig('fig1_allfeatures.png')
plt.show()

Focus analysis on two phases: Climb and Cruise

  • Which seems most promising for saving fuel
In [8]:
fig, ax = plt.subplots(figsize=(15, 9))
a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig2_45features.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('fuel flow rate',fontsize = 20)
plt.xlabel('index',fontsize = 20)
plt.show()

To exclude per-flight dependence

  • Shuffle points for each phase
  • Delete all features which relate data points to flights
In [9]:
fig, ax = plt.subplots(figsize=(15, 9))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig3_45features_shuf.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('fuel flow rate',fontsize = 20)
plt.xlabel('index',fontsize = 20)
plt.show()

Train model on shuffled training data set

  • I used XGBoost to predict Climbing and Cruise phase
  • Model trained for every phase independently
In [10]:
fig, ax = plt.subplots(figsize=(15, 9))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('fuel flow rate',fontsize = 20)
plt.xlabel('index',fontsize = 20)
plt.show()

Can predict unshuffled test set flights with trained model

  • r^2 error for train sample phase4 and phase 5: 0.9993 and 0.9933
  • r^2 error for test sample phase4 and phase 5: 0.9886 and 0.9185
In [11]:
fig, ax = plt.subplots(figsize=(15, 9))

#min_test=['676200308221151',
# '676200406201308',
# '676200406201655',
# '676201003110757',
# '676200405101740',
# '676200306121943',
# '676200307021338',
# '676200307030733',
# '676200406181158',
# '676200403090417']
#max_test = ['676200310231759',
# '676200306171503',
# '676200306180951',
# '676200406160506',
# '676200405081024',
# '676200402101617',
# '676200305141328']
#st = max_test[0]

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('fuel flow rate',fontsize = 20)
plt.xlabel('index',fontsize = 20)
plt.show()
In [12]:
import matplotlib
import matplotlib.pyplot as plt
import dill
import zipfile
import pandas as pd
import xgboost as xgb
import glob
import numpy as np
import matplotlib.mlab as mlab
from sklearn.metrics import r2_score

def get_stats(li):    
    liflights = []
    liflights_id = []
    allpoints = []
    r2 = 0
    for fn in li:
        fid = '6'+fn.split('.csv')[0].split('_6')[-1]
        df = pd.read_csv(fn)
        se = (df.FF-df.Prediction)/df.FF
        allpoints += list(se.as_matrix())
        liflights.append(np.mean(se))
        liflights_id.append((np.mean(se),fid))
        r2 +=r2_score(df.FF, df.Prediction)
    return (allpoints, liflights, liflights_id,r2/len(liflights))

li = glob.glob('models_best/model4/phase4_test/residuals_phase4_67*')
all_test4, liflights_test4, ids_test4, r2_4te = get_stats(li)

li = glob.glob('models_best/model4/phase4_train/residuals_phase4_67*')
all_train4, liflights_train4, ids_train4, r2_4tr = get_stats(li)

li = glob.glob('models_best/model5/phase5_test/residuals_phase5_6*')
all_test5, liflights_test5, ids_test5, r2_5te = get_stats(li)

li = glob.glob('models_best/model5/phase5_train/residuals_phase5_6*')
all_train5, liflights_train5, ids_train5, r2_5tr = get_stats(li)
In [ ]:
f, ((ax1), (ax2)) = plt.subplots(1, 2, figsize=(15, 4.5), sharex='row')#, sharey='row')

n, bins, patches = ax1.hist(np.array(liflights_test4), 25, normed=0, alpha=0.75, label = 'test')

n, bins, patches = ax1.hist(np.array(liflights_train4), 25, normed=0, alpha=0.75, label = 'train')

#m1 = max(liflights_test5)
#m2 = max(liflights_train5)
n, bins, patches = ax2.hist(np.array(liflights_test5), 25, normed=0, alpha=0.75, label = 'test')
n, bins, patches = ax2.hist(np.array(liflights_train5), 25, normed=0, alpha=0.75, label = 'train')
ax1.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax2.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb')
ax2.set_title('Phase 5: Cruise')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
#plt.ylim(ymax=80)
plt.show()

Model error of random set of points compared with prediction error for real flight

  • Model error of random set is estimated as the standard deviation if the flights would consist of random points
  • Prediction error for real flights is computed as average residual for each flight
In [18]:
f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(15, 8), sharex='col')

def plothi(all_, liflights_, sc, axi):
    mu = 0
    scale = 6000
    variance = np.var(all_)
    coef = 1./np.sqrt(len(all_)/len(liflights_))
    sigma = np.sqrt(variance)*coef
    x = np.linspace(-10./scale, 10./scale, 500)
    thr_max = np.percentile(np.array(liflights_),90)
    thr_min = np.percentile(np.array(liflights_),10)
    hi = np.array(liflights_)
    n, bins, patches = axi.hist([hi[np.multiply(hi<thr_max,hi>thr_min)],hi[hi<thr_min],hi[hi>thr_max]], 25, normed=1, alpha=0.75, color = ['C0','C1','C1'], stacked = True, label = 'E_flights')
    axi.plot(x,mlab.normpdf(x, mu, sigma)*sc, lw = 2, color = 'C8',label = 'E_model')
    axi.legend(fontsize = 20)
    axi.tick_params(axis='both', labelsize=18)

    
plothi(all_train4, liflights_train4, 1./3., ax1)
plothi(all_test4, liflights_test4, 0.083, ax2)
plothi(all_train5, liflights_train5, 1./3., ax3)
plothi(all_test5, liflights_test5, 0.055, ax4)

ax3.set_xlabel('residuals',fontsize = 18)
ax4.set_xlabel('residuals',fontsize = 18)

ax1.set_ylabel('freq. of flights',fontsize = 18)
ax3.set_ylabel('freq. of flights',fontsize = 18)

ax1.set_title('Phase 4: Climb, Train sample', fontsize = 18)
ax2.set_title('Phase 4: Climb, Test sample', fontsize = 18)
ax3.set_title('Phase 5: Climb, Train sample', fontsize = 18)
ax4.set_title('Phase 5: Climb, Test sample', fontsize = 18)

plt.show()
In [15]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[15]: