Predict fuel flow rate of airplanes¶

In this Project I am going to predicti fuel flow rate of airplanes during different phases of flight.

The dataset is 12 Gb and 1000 flights¶

import matplotlib.pyplot as plt
import matplotlib.cm
import pandas as pd 
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')

fig, ax = plt.subplots(figsize=(16,12))
# coordinates are taken from http://boundingbox.klokantech.com/

coorlist = [-133.15,24.29,-61.52,54.37]
rout_color = (204/255.0, 0, 153/255.0, 0.7)#"orange"
bg_color = (0.0, 0.0, 0, 0.0)
coast_color = (0.0, 0.0, 0, 1.0)
#coast_color = (204/255.0, 0, 153/255.0, 0.7)
fl_color = (204/255.0, 0, 153/255.0, 0.7)
m = Basemap(resolution='l', # c, l, i, h, f or None
            projection='merc',
            llcrnrlon=coorlist[0], llcrnrlat= coorlist[1], urcrnrlon=coorlist[2], urcrnrlat=coorlist[3])
m.drawcountries()
m.drawcoastlines(color=coast_color, linewidth=1.0)
m.fillcontinents(color=bg_color)
m.drawmapboundary(fill_color=bg_color)
m.shadedrelief()
path = pd.read_csv('ALL_data.csv', index_col=False)
resmap = path.groupby(['Flight_instance_ID'])

for ni, pos in enumerate(resmap):
    bla = pos[1]
    x1 = list(bla.LONP)
    y1 = list(bla.LATP)
    x, y = m(x1,y1)
    # in case of out of box, gives an error
    try:
        m.drawgreatcircle(x1[0], y1[0], x1[-1], y1[-1], linewidth=0.75, color="red", alpha = 0.2)
        m.plot(x, y, 'o', markersize='6', color='#444444', alpha=0.1)
    except:
        pass
plt.show()

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import pandas as pd
from datetime import datetime
from babel.dates import format_timedelta


#list_ = []
#z = zipfile.ZipFile('CAX_Train_1.zip')
#allFiles = z.namelist()
#df_from_each_file = (pd.read_csv(z.open(f), index_col=False) for f in allFiles[0:1])
#frame   = pd.concat(df_from_each_file, ignore_index=True)
list_f_ids = ['676200403231928',
 '676200403201250',
 '676200306100458',
 '676200408241718',
 '676200404101643']

st = str(list_f_ids[2])
moddir = 'models_rand_allf/ind_2000run_nocoor/'#'model_2/'
frame = pd.read_csv('some_flights/'+st+'.csv')
#'test_676200312080912.csv'
def to_dattim(y, m, d, h, mi, s):
    return pd.Timestamp(int(y), int(m), int(d), int(h), int(mi), int(s))

a = frame.columns

frame["datetime"] =  frame.apply(lambda row: to_dattim(row['Year'], row['Month'], row['Day'], row['Hour'], row['Minute'], row['Second']), axis=1)

temp = frame.groupby(['Flight_instance_ID']).datetime.first()

def minus_first(x,y,z):
    return y - temp[x]

frame.datetime = frame.apply(lambda row: minus_first(row['Flight_instance_ID'], row['datetime'],temp), axis=1)

There are 8 different phases of flight in the dataset.¶

There are around 220 different parameters¶

I use XGBoost to predict Climbing phase.

def plot_pred(Mod_num, ff_id, axi, start,  shuf = False, pred = 1):
    lc = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    zipnam = 'models_best/model'+Mod_num+'/phase'+Mod_num+'_test_separated.zip'
    z = zipfile.ZipFile(zipnam)
    finzip = 'phase'+Mod_num+'_test_separated/phase'+Mod_num+'_'+ff_id+'.csv'
    df = pd.read_csv(z.open(finzip), index_col=False)
    modelpath = "models_best/model"+Mod_num+"/model_new_all_phase_"+Mod_num+".pkd"
    
    if shuf:
        df = df.sample(frac = 1)
    ln = len(df.FF)
    y = df.pop('FF')
    
    ##strd = pd.Timestamp('2017-1-1 00:00:00')+ pd.Timedelta(start, unit='s')
    ##endd = strd + pd.Timedelta(ln-1, unit='s')
    ##x = pd.date_range(start=strd, end=endd, freq = 's')   
    x = range(start,ln +start)
    if Mod_num =='4':
        axi.plot(x, y,'o', color = lc[3], label = 'Climb')
    elif Mod_num =='5':
        axi.plot(x, y,'o', color = lc[4], label = 'Cruise')
    if pred:     
        df = df.drop(['Unnamed: 0.1', 'Flight_instance_ID'], axis = 1)
        xtest = xgb.DMatrix(df, label = y)
        with open(modelpath,'r') as f:
            modelx = dill.load(f)
        pred = modelx.predict(xtest)
        
        if Mod_num =='4':
            axi.plot(x, pred, color = 'black', lw =0.5, label = 'pred. Climb')
        elif Mod_num =='5':
            axi.plot(x, pred, color = 'red', lw =0.5, label = 'pred. Cruise')

    return ln

### plot all phases
import warnings
import sklearn.metrics
import xgboost as xgb
from ipywidgets import StaticInteract, RangeWidget, RadioWidget, DropDownWidget
warnings.filterwarnings('ignore')
lph = ['Unknown','Preflight','Taxi','Takeoff', 'Climb','Cruise','Approach','Rollout']
dat = pd.Timestamp('2017-1-1 00:00:00')
datmax = dat+ frame.datetime.iloc[-1]


def plot_phase(ax,xmi = dat, xma = datmax):
    for key, grp in frame.groupby(['Flight_instance_ID','PH']):
        ph = grp['PH'].head(1).reset_index(drop=True)[0]
        grp['datetime']=grp['datetime'].apply(lambda x: x+dat)
        #print grp.datetime.iloc[0], dat, type(grp.datetime.iloc[0]), type(dat)
        if ph<7:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph])
        else:
            ax.plot(grp['datetime'], grp['FF'],'o', lw = 2., label = lph[ph],color='black')
        if ph == 4:
            # take series for 4th phase to plot
            bla = grp['datetime']
        elif ph ==5:
            bla5 = grp['datetime']
    fig.canvas.draw()
    labels = [item.get_text() for item in ax.get_xticklabels()]
    labels = [i[3:] for i in labels]
    labels = labels[1:]
    #labels[1] = 'Testing'
    ax.set_xticklabels(labels, fontsize = 18)
    labels = [item.get_text() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels, fontsize = 18)

    plt.xticks(rotation=45)
    ax.set_xlim(xmin = xmi, xmax = xma)
    plt.ylabel('Fuel Flow Rate',fontsize = 20)
    plt.xlabel('Time of Flight, [m]',fontsize = 20)

matplotlib.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(16, 12))
plot_phase(ax)
plt.legend(fontsize = 24)
plt.savefig('fig1_allfeatures.png')
plt.show()

Focus on two Phases¶

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig2_45features.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Shuffle points for each phase to exclude per-flight dependence¶

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 0)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 0)


plt.legend(fontsize = 24)
plt.savefig('fig3_45features_shuf.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

Train model on shuffled Training data set for each phase¶

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 1, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 1, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

The fuel flow rate for the flight from training set and the model prediciton¶

fig, ax = plt.subplots(figsize=(16, 12))

a = plot_pred('4', st,  ax, start = 0, shuf = 0, pred = 1)
plot_pred(    '5', st,  ax, start = a, shuf = 0, pred = 1)


plt.legend(fontsize = 24)
plt.savefig('fig4_45features_shuf_pred.png')
fig.canvas.draw()

#labels = [item.get_text() for item in ax.get_xticklabels()]
#labels = [i for i in labels]
#labels = labels[1:]
ax.set_xticklabels([str(i) for i in range(-1000,4001,1000)], fontsize = 18)

labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(labels, fontsize = 18)
   
plt.xticks(rotation=45)
#ax.set_xlim(xmin = pd.Timestamp('2017-1-1 00:00:00'))
plt.ylabel('Fuel Flow Rate',fontsize = 20)
plt.xlabel('Time of Flight, [m]',fontsize = 20)
plt.show()

import matplotlib
import matplotlib.pyplot as plt
import dill
import zipfile
import pandas as pd
import xgboost as xgb
import glob
import numpy as np

li_test = glob.glob('models_best/model4/phase4_test/residuals_phase4_67*')
liflights_test = []
alltest4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest4+= list(se.as_matrix()).append(se.as_matrix())
    liflights_test.append(np.mean(se))

li_test = glob.glob('models_best/model4/phase4_train/residuals_phase4_67*')
liflights_train = []
alltrain4 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain4 += list(se.as_matrix())
    liflights_train.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_test/residuals_phase5_6*')
liflights_test5 = []
alltest5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltest5+= list(se.as_matrix())
    liflights_test5.append(np.mean(se))


li_test = glob.glob('models_best/model5/phase5_train/residuals_phase5_6*')
liflights_train5 = []
alltrain5 = []
for fn in li_test:
    df = pd.read_csv(fn)
    se = df.FF-df.Prediction
    alltrain5+= list(se.as_matrix())
    liflights_train5.append(np.mean(se))

In-sample and Out-sample error for Climb and Cruise phases¶

Insample error is smaller¶

We can find flights with large and small fuel consumption for both training and test sets¶

f, ((ax1), (ax2)) = plt.subplots(1, 2, figsize=(16, 12), sharex='col', sharey='row')

n, bins, patches = ax1.hist(np.array(liflights_test), 25, normed=0, alpha=0.75, label = 'test sample')

n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=0, alpha=0.75, label = 'train sample')

#m1 = max(liflights_test5)
#m2 = max(liflights_train5)
n, bins, patches = ax2.hist(np.array(liflights_test5), 25, normed=0, alpha=0.75, label = 'test sample')
n, bins, patches = ax2.hist(np.array(liflights_train5), 25, normed=0, alpha=0.75, label = 'train sample')
ax1.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax2.set_xlabel('(Pr. - F_R) for each fligh',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb')
ax2.set_title('Phase 5: Cruise')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
plt.ylim(ymax=80)
plt.show()

Model error compared with fuel flow rate error per flight¶

Model error is estimated as the standard deviation if the flights would consist of random points.¶

Flights error is estimated as average difference between prediction and real fuel flow rate per flight¶

import matplotlib.mlab as mlab
f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(16, 12), sharex='col')#, sharey='row')

#fig = plt.figure(figsize=(6, 8))
#ax1 = fig.add_subplot(2, 2, 1)
mu = 0
variance = np.var(alltrain4)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain4)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax1.hist(np.array(liflights_train), 25, normed=1, alpha=0.75, label = 'flights error')
ax1.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest4)
sigma = np.sqrt(variance)/np.sqrt(len(alltest4)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax2.hist(np.array(liflights_test), 25, normed=1, alpha=0.75, label = 'flights error')
ax2.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.0075, label = 'model error')

variance = np.var(alltrain5)
sigma = np.sqrt(variance)/np.sqrt(len(alltrain5)/602.)
x = np.linspace(-10, 10, 500)
n, bins, patches = ax3.hist(np.array(liflights_train5), 25, normed=1, alpha=0.75, label = 'flights error')
ax3.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.03, label = 'model error')

variance = np.var(alltest5)
sigma = np.sqrt(variance)/np.sqrt(len(alltest5)/301.)
x = np.linspace(-100, 100, 500)
n, bins, patches = ax4.hist(np.array(liflights_test5), 25, normed=1, alpha=0.75, label = 'flights error')
ax4.plot(x,mlab.normpdf(x, mu, sigma)/0.09*0.005, label = 'model error')



#ax1.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
#ax2.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax3.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax4.set_xlabel('Prediction - Fuel_Rate',fontsize = 18)
ax1.legend(fontsize = 24)
ax2.legend(fontsize = 24)
ax3.legend(fontsize = 24)
ax4.legend(fontsize = 24)
ax1.set_ylabel('freq. of flights in bin',fontsize = 18)
ax1.set_title('Phase 4: Climb, Train sample')
ax2.set_title('Phase 4: Climb, Test sample')
ax3.set_title('Phase 5: Climb, Train sample')
ax4.set_title('Phase 5: Climb, Test sample')
ax1.tick_params(axis='both', labelsize=18)
ax2.tick_params(axis='both', labelsize=18)
ax3.tick_params(axis='both', labelsize=18)
ax4.tick_params(axis='both', labelsize=18)
plt.show()

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')