Predict fuel flow rate of airplanes¶

There are around 220 different parameters
The dataset is 12 Gb and 1000 flights
Most of the flights are domestic US flights

In [327]:

import matplotlib.pyplot as plt
import matplotlib.cm
import pandas as pd 
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')

fig, ax = plt.subplots(figsize=(16,12))
# coordinates are taken from http://boundingbox.klokantech.com/

coorlist = [-133.15,24.29,-61.52,54.37]
rout_color = (204/255.0, 0, 153/255.0, 0.7)#"orange"
bg_color = (0.0, 0.0, 0, 0.0)
coast_color = (0.0, 0.0, 0, 1.0)
#coast_color = (204/255.0, 0, 153/255.0, 0.7)
fl_color = (204/255.0, 0, 153/255.0, 0.7)
m = Basemap(resolution='l', # c, l, i, h, f or None
            projection='merc',
            llcrnrlon=coorlist[0], llcrnrlat= coorlist[1], urcrnrlon=coorlist[2], urcrnrlat=coorlist[3])
m.drawcountries()
m.drawcoastlines(color=coast_color, linewidth=1.0)
m.fillcontinents(color=bg_color)
m.drawmapboundary(fill_color=bg_color)
m.shadedrelief()
path = pd.read_csv('ALL_data.csv', index_col=False)
resmap = path.groupby(['Flight_instance_ID'])

for ni, pos in enumerate(resmap):
    bla = pos[1]
    x1 = list(bla.LONP)
    y1 = list(bla.LATP)
    x, y = m(x1,y1)
    # in case of out of box, gives an error
    try:
        m.drawgreatcircle(x1[0], y1[0], x1[-1], y1[-1], linewidth=0.75, color="red", alpha = 0.2)
        m.plot(x, y, 'o', markersize='6', color='#444444', alpha=0.1)
    except:
        pass
plt.show()

In [147]:

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import pandas as pd
from datetime import datetime
from babel.dates import format_timedelta


#list_ = []
#z = zipfile.ZipFile('CAX_Train_1.zip')
#allFiles = z.namelist()
#df_from_each_file = (pd.read_csv(z.open(f), index_col=False) for f in allFiles[0:1])
#frame   = pd.concat(df_from_each_file, ignore_index=True)
list_f_ids = ['676200403231928',
 '676200403201250',
 '676200306100458',
 '676200408241718',
 '676200404101643']

st = str(list_f_ids[2])
moddir = 'models_rand_allf/ind_2000run_nocoor/'#'model_2/'
frame = pd.read_csv('some_flights/'+st+'.csv')
#'test_676200312080912.csv'
def to_dattim(y, m, d, h, mi, s):
    return pd.Timestamp(int(y), int(m), int(d), int(h), int(mi), int(s))

a = frame.columns

frame["datetime"] =  frame.apply(lambda row: to_dattim(row['Year'], row['Month'], row['Day'], row['Hour'], row['Minute'], row['Second']), axis=1)

temp = frame.groupby(['Flight_instance_ID']).datetime.first()

def minus_first(x,y,z):
    return y - temp[x]

frame.datetime = frame.apply(lambda row: minus_first(row['Flight_instance_ID'], row['datetime'],temp), axis=1)

7 different phases of flight in the dataset.¶

Most fuel is spent during Climb and Cruise

In [215]:

def plot_pred(Mod_num, ff_id, axi, start,  shuf = False, pred = 1):
    lc = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    zipnam = 'models_best/model'+Mod_num+'/phase'+Mod_num+'_test_separated.zip'
    z = zipfile.ZipFile(zipnam)
    finzip = 'phase'+Mod_num+'_test_separated/phase'+Mod_num+'_'+ff_id+'.csv'
    df = pd.read_csv(z.open(finzip), index_col=False)
    modelpath = "models_best/model"+Mod_num+"/model_new_all_phase_"+Mod_num+".pkd"
    
    if shuf:
        df = df.sample(frac = 1)
    ln = len(df.FF)
    y = df.pop('FF')
    
    ##strd = pd.Timestamp('2017-1-1 00:00:00')+ pd.Timedelta(start, unit='s')
    ##endd = strd + pd.Timedelta(ln-1, unit='s')
    ##x = pd.date_range(start=strd, end=endd, freq = 's')   
    x = range(start,ln +start)
    if Mod_num =='4':
        axi.plot(x, y,'o', color = lc[3], label = 'Climb')
    elif Mod_num =='5':
        axi.plot(x, y,'o', color = lc[4], label = 'Cruise')
    if pred:     
        df = df.drop(['Unnamed: 0.1', 'Flight_instance_ID'], axis = 1)
        xtest = xgb.DMatrix(df, label = y)
        with open(modelpath,'r') as f:
            modelx = dill.load(f)
        pred = modelx.predict(xtest)
        
        if Mod_num =='4':
            axi.plot(x, pred, color = 'black', lw =0.5, label = 'pred. Climb')
        elif Mod_num =='5':
            axi.plot(x, pred, color = 'red', lw =0.5, label = 'pred. Cruise')

    return ln

In [149]:

### plot all phases
import warnings
import sklearn.metrics
import xgboost as xgb
from ipywidgets import StaticInteract, RangeWidget, RadioWidget, DropDownWidget
warnings.filterwarnings('ignore')
lph = ['Unknown','Preflight','Taxi','Takeoff', 'Climb','Cruise','Approach','Rollout']
dat = pd.Timestamp('2017-1-1 00:00:00')
datmax = dat+ frame.datetime.iloc[-1]


def plot_phase(ax,xmi = dat, xma = datmax):
    for key, grp in frame.groupby(['Flight_instance_ID','PH']):
        ph = grp['PH'].head(1).reset_index(drop=True)[0]
        grp['datetime']=grp['datetime'].apply(lambda x: x+dat)
        #print grp.datetime.iloc[0], dat, type(grp.datetime.iloc[0]), type(dat)
        if ph<7:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph])
        else:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph],color='black')
        if ph == 4:
            # take series for 4th phase to plot
            bla = grp['datetime']
        elif ph ==5:
            bla5 = grp['datetime']
    fig.canvas.draw()
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels = [i[3:] for i in labels]
    labels = labels[1:]
    #labels[1] = 'Testing'
    ax.set_xticklabels(labels, fontsize = 18)
    labels = [item.get_text() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels, fontsize = 18)

    plt.xticks(rotation=45)
    ax.set_xlim(xmin = xmi, xmax = xma)
    plt.ylabel('Fuel Flow Rate',fontsize = 20)
    plt.xlabel('Time of Flight, [m]',fontsize = 20)

In [193]:

matplotlib.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(16, 12))
plot_phase(ax)
plt.legend(fontsize = 24)
plt.savefig('fig1_allfeatures.png')
plt.show()

Focus analysis on two phases: Climb and Cruise¶

Which seems most promising for saving fuel

In [212]:

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig2_45features.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

To exclude per-flight dependence¶

Shuffle points for each phase
Delete all features which relate data points to flights

In [213]:

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig3_45features_shuf.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Train model on shuffled training data set¶

I used XGBoost to predict Climbing and Cruise phase
Model trained for every phase independently

In [216]:

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Can predict unshuffled test set flights with trained model¶

In [217]:

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

In [245]:

import matplotlib
import matplotlib.pyplot as plt
import dill
import zipfile
import pandas as pd
import xgboost as xgb
import glob
import numpy as np

li_test = glob.glob('models_best/model4/phase4_test/residuals_phase4_67*')
liflights_test = []
alltest4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest4+= list(se.as_matrix()).append(se.as_matrix())
    liflights_test.append(np.mean(se))

li_test = glob.glob('models_best/model4/phase4_train/residuals_phase4_67*')
liflights_train = []
alltrain4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain4 += list(se.as_matrix())
    liflights_train.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_test/residuals_phase5_6*')
liflights_test5 = []
alltest5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest5+= list(se.as_matrix())
    liflights_test5.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_train/residuals_phase5_6*')
liflights_train5 = []
alltrain5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain5+= list(se.as_matrix())
    liflights_train5.append(np.mean(se))

In-sample and Out-sample error for Climb and Cruise phases¶

Insample error is smaller
One can find flights with large and small fuel consumption for both training and test sets

In [324]:

f, ((ax1), (ax2)) = plt.subplots(1, 2, figsize=(16, 12), sharex='col', sharey='row')

n, bins, patches = ax1.hist(np.array(liflights_test), 25, normed=0, alpha=0.75, label = 'test sample')

n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=0, alpha=0.75, label = 'train sample')

#m1 = max(liflights_test5)
#m2 = max(liflights_train5)
n, bins, patches = ax2.hist(np.array(liflights_test5), 25, normed=0, alpha=0.75, label = 'test sample')
n, bins, patches = ax2.hist(np.array(liflights_train5), 25, normed=0, alpha=0.75, label = 'train sample')
ax1.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax2.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb')
ax2.set_title('Phase 5: Cruise')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
plt.ylim(ymax=80)
plt.show()

Model error compared with fuel flow rate error per flight¶

Model error is estimated as the standard deviation if the flights would consist of random points.
Flights error is estimated as average difference between prediction and real fuel flow rate per flight

In [322]:

import matplotlib.mlab as mlab
f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(16, 12), sharex='col')#, sharey='row')

#fig = plt.figure(figsize=(6, 8))
#ax1 = fig.add_subplot(2, 2, 1)
mu = 0
variance = np.var(alltrain4)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain4)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=1, alpha=0.75, label = 'flights error')
ax1.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest4)
sigma = np.sqrt(variance)/np.sqrt(len(alltest4)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax2.hist(np.array(liflights_test), 25, normed=1, alpha=0.75, label = 'flights error')
ax2.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.0075, label = 'model error')

variance = np.var(alltrain5)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain5)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax3.hist(np.array(liflights_train5), 25, normed=1, alpha=0.75, label = 'flights error')
ax3.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest5)
sigma = np.sqrt(variance)/np.sqrt(len(alltest5)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax4.hist(np.array(liflights_test5), 25, normed=1, alpha=0.75, label = 'flights error')
ax4.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.005, label = 'model error')



#ax1.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
#ax2.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax3.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax4.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax3.legend(fontsize = 24)
ax4.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb, Train sample')
ax2.set_title('Phase 4: Climb, Test sample')
ax3.set_title('Phase 5: Climb, Train sample')
ax4.set_title('Phase 5: Climb, Test sample')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
ax3.tick_params(axis='both', labelsize=18)
ax4.tick_params(axis='both', labelsize=18)
plt.show()

In [325]:

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

Out[325]: