Predict fuel flow rate of airplanes

  • There are around 220 different parameters
  • The dataset is 12 Gb and 1000 flights
  • Most of the flights are domestic US flights
In [327]:
import matplotlib.pyplot as plt
import matplotlib.cm
import pandas as pd 
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')

fig, ax = plt.subplots(figsize=(16,12))
# coordinates are taken from http://boundingbox.klokantech.com/

coorlist = [-133.15,24.29,-61.52,54.37]
rout_color = (204/255.0, 0, 153/255.0, 0.7)#"orange"
bg_color = (0.0, 0.0, 0, 0.0)
coast_color = (0.0, 0.0, 0, 1.0)
#coast_color = (204/255.0, 0, 153/255.0, 0.7)
fl_color = (204/255.0, 0, 153/255.0, 0.7)
m = Basemap(resolution='l', # c, l, i, h, f or None
            projection='merc',
            llcrnrlon=coorlist[0], llcrnrlat= coorlist[1], urcrnrlon=coorlist[2], urcrnrlat=coorlist[3])
m.drawcountries()
m.drawcoastlines(color=coast_color, linewidth=1.0)
m.fillcontinents(color=bg_color)
m.drawmapboundary(fill_color=bg_color)
m.shadedrelief()
path = pd.read_csv('ALL_data.csv', index_col=False)
resmap = path.groupby(['Flight_instance_ID'])

for ni, pos in enumerate(resmap):
    bla = pos[1]
    x1 = list(bla.LONP)
    y1 = list(bla.LATP)
    x, y = m(x1,y1)
    # in case of out of box, gives an error
    try:
        m.drawgreatcircle(x1[0], y1[0], x1[-1], y1[-1], linewidth=0.75, color="red", alpha = 0.2)
        m.plot(x, y, 'o', markersize='6', color='#444444', alpha=0.1)
    except:
        pass
plt.show()
In [147]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import pandas as pd
from datetime import datetime
from babel.dates import format_timedelta


#list_ = []
#z = zipfile.ZipFile('CAX_Train_1.zip')
#allFiles = z.namelist()
#df_from_each_file = (pd.read_csv(z.open(f), index_col=False) for f in allFiles[0:1])
#frame   = pd.concat(df_from_each_file, ignore_index=True)
list_f_ids = ['676200403231928',
 '676200403201250',
 '676200306100458',
 '676200408241718',
 '676200404101643']

st = str(list_f_ids[2])
moddir = 'models_rand_allf/ind_2000run_nocoor/'#'model_2/'
frame = pd.read_csv('some_flights/'+st+'.csv')
#'test_676200312080912.csv'
def to_dattim(y, m, d, h, mi, s):
    return pd.Timestamp(int(y), int(m), int(d), int(h), int(mi), int(s))

a = frame.columns

frame["datetime"] =  frame.apply(lambda row: to_dattim(row['Year'], row['Month'], row['Day'], row['Hour'], row['Minute'], row['Second']), axis=1)

temp = frame.groupby(['Flight_instance_ID']).datetime.first()

def minus_first(x,y,z):
    return y - temp[x]

frame.datetime = frame.apply(lambda row: minus_first(row['Flight_instance_ID'], row['datetime'],temp), axis=1)

7 different phases of flight in the dataset.

  • Most fuel is spent during Climb and Cruise
In [215]:
def plot_pred(Mod_num, ff_id, axi, start,  shuf = False, pred = 1):
    lc = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    zipnam = 'models_best/model'+Mod_num+'/phase'+Mod_num+'_test_separated.zip'
    z = zipfile.ZipFile(zipnam)
    finzip = 'phase'+Mod_num+'_test_separated/phase'+Mod_num+'_'+ff_id+'.csv'
    df = pd.read_csv(z.open(finzip), index_col=False)
    modelpath = "models_best/model"+Mod_num+"/model_new_all_phase_"+Mod_num+".pkd"
    
    if shuf:
        df = df.sample(frac = 1)
    ln = len(df.FF)
    y = df.pop('FF')
    
    ##strd = pd.Timestamp('2017-1-1 00:00:00')+ pd.Timedelta(start, unit='s')
    ##endd = strd + pd.Timedelta(ln-1, unit='s')
    ##x = pd.date_range(start=strd, end=endd, freq = 's')   
    x = range(start,ln +start)
    if Mod_num =='4':
        axi.plot(x, y,'o', color = lc[3], label = 'Climb')
    elif Mod_num =='5':
        axi.plot(x, y,'o', color = lc[4], label = 'Cruise')
    if pred:     
        df = df.drop(['Unnamed: 0.1', 'Flight_instance_ID'], axis = 1)
        xtest = xgb.DMatrix(df, label = y)
        with open(modelpath,'r') as f:
            modelx = dill.load(f)
        pred = modelx.predict(xtest)
        
        if Mod_num =='4':
            axi.plot(x, pred, color = 'black', lw =0.5, label = 'pred. Climb')
        elif Mod_num =='5':
            axi.plot(x, pred, color = 'red', lw =0.5, label = 'pred. Cruise')

    return ln
In [149]:
### plot all phases
import warnings
import sklearn.metrics
import xgboost as xgb
from ipywidgets import StaticInteract, RangeWidget, RadioWidget, DropDownWidget
warnings.filterwarnings('ignore')
lph = ['Unknown','Preflight','Taxi','Takeoff', 'Climb','Cruise','Approach','Rollout']
dat = pd.Timestamp('2017-1-1 00:00:00')
datmax = dat+ frame.datetime.iloc[-1]


def plot_phase(ax,xmi = dat, xma = datmax):
    for key, grp in frame.groupby(['Flight_instance_ID','PH']):
        ph = grp['PH'].head(1).reset_index(drop=True)[0]
        grp['datetime']=grp['datetime'].apply(lambda x: x+dat)
        #print grp.datetime.iloc[0], dat, type(grp.datetime.iloc[0]), type(dat)
        if ph<7:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph])
        else:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph],color='black')
        if ph == 4:
            # take series for 4th phase to plot
            bla = grp['datetime']
        elif ph ==5:
            bla5 = grp['datetime']
    fig.canvas.draw()
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels = [i[3:] for i in labels]
    labels = labels[1:]
    #labels[1] = 'Testing'
    ax.set_xticklabels(labels, fontsize = 18)
    labels = [item.get_text() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels, fontsize = 18)

    plt.xticks(rotation=45)
    ax.set_xlim(xmin = xmi, xmax = xma)
    plt.ylabel('Fuel Flow Rate',fontsize = 20)
    plt.xlabel('Time of Flight, [m]',fontsize = 20)
In [193]:
matplotlib.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(16, 12))
plot_phase(ax)
plt.legend(fontsize = 24)
plt.savefig('fig1_allfeatures.png')
plt.show()

Focus analysis on two phases: Climb and Cruise

  • Which seems most promising for saving fuel
In [212]:
fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig2_45features.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

To exclude per-flight dependence

  • Shuffle points for each phase
  • Delete all features which relate data points to flights
In [213]:
fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig3_45features_shuf.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Train model on shuffled training data set

  • I used XGBoost to predict Climbing and Cruise phase
  • Model trained for every phase independently
In [216]:
fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Can predict unshuffled test set flights with trained model

In [217]:
fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()
In [245]:
import matplotlib
import matplotlib.pyplot as plt
import dill
import zipfile
import pandas as pd
import xgboost as xgb
import glob
import numpy as np

li_test = glob.glob('models_best/model4/phase4_test/residuals_phase4_67*')
liflights_test = []
alltest4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest4+= list(se.as_matrix()).append(se.as_matrix())
    liflights_test.append(np.mean(se))

li_test = glob.glob('models_best/model4/phase4_train/residuals_phase4_67*')
liflights_train = []
alltrain4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain4 += list(se.as_matrix())
    liflights_train.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_test/residuals_phase5_6*')
liflights_test5 = []
alltest5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest5+= list(se.as_matrix())
    liflights_test5.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_train/residuals_phase5_6*')
liflights_train5 = []
alltrain5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain5+= list(se.as_matrix())
    liflights_train5.append(np.mean(se))

In-sample and Out-sample error for Climb and Cruise phases

  • Insample error is smaller
  • One can find flights with large and small fuel consumption for both training and test sets
In [324]:
f, ((ax1), (ax2)) = plt.subplots(1, 2, figsize=(16, 12), sharex='col', sharey='row')

n, bins, patches = ax1.hist(np.array(liflights_test), 25, normed=0, alpha=0.75, label = 'test sample')

n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=0, alpha=0.75, label = 'train sample')

#m1 = max(liflights_test5)
#m2 = max(liflights_train5)
n, bins, patches = ax2.hist(np.array(liflights_test5), 25, normed=0, alpha=0.75, label = 'test sample')
n, bins, patches = ax2.hist(np.array(liflights_train5), 25, normed=0, alpha=0.75, label = 'train sample')
ax1.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax2.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb')
ax2.set_title('Phase 5: Cruise')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
plt.ylim(ymax=80)
plt.show()

Model error compared with fuel flow rate error per flight

  • Model error is estimated as the standard deviation if the flights would consist of random points.
  • Flights error is estimated as average difference between prediction and real fuel flow rate per flight
In [322]:
import matplotlib.mlab as mlab
f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(16, 12), sharex='col')#, sharey='row')

#fig = plt.figure(figsize=(6, 8))
#ax1 = fig.add_subplot(2, 2, 1)
mu = 0
variance = np.var(alltrain4)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain4)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=1, alpha=0.75, label = 'flights error')
ax1.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest4)
sigma = np.sqrt(variance)/np.sqrt(len(alltest4)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax2.hist(np.array(liflights_test), 25, normed=1, alpha=0.75, label = 'flights error')
ax2.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.0075, label = 'model error')

variance = np.var(alltrain5)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain5)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax3.hist(np.array(liflights_train5), 25, normed=1, alpha=0.75, label = 'flights error')
ax3.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest5)
sigma = np.sqrt(variance)/np.sqrt(len(alltest5)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax4.hist(np.array(liflights_test5), 25, normed=1, alpha=0.75, label = 'flights error')
ax4.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.005, label = 'model error')



#ax1.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
#ax2.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax3.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax4.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax3.legend(fontsize = 24)
ax4.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb, Train sample')
ax2.set_title('Phase 4: Climb, Test sample')
ax3.set_title('Phase 5: Climb, Train sample')
ax4.set_title('Phase 5: Climb, Test sample')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
ax3.tick_params(axis='both', labelsize=18)
ax4.tick_params(axis='both', labelsize=18)
plt.show()
In [325]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[325]: