Outlier Detection Again

03 Jul 2018

Yes, same old, nothing new: I just tidied the code up a little for linking to Presto, and added labelling for the outliers.

import pandas as pd
import numpy as np
from pyculiarity import detect_ts
import matplotlib.pyplot as plt
from pyhive import presto

def extract_from_presto(query, host=''):
    """Run ``query`` against Presto and return the result as a dataframe.

    Args:
        query: SQL text to execute. It must yield exactly two columns, which
            are labelled ``cob`` and ``value`` in the returned dataframe.
        host: Presto coordinator host name. Defaults to the empty
            placeholder; insert your own value (and any further
            ``presto.connect`` keyword arguments your deployment needs).

    Returns:
        A ``pandas.DataFrame`` with columns ``['cob', 'value']``, one row per
        row fetched from the cursor.
    """
    # connect to presto
    cursor = presto.connect(
        host=host,  # insert your own credentials here!!
    ).cursor()
    # query from presto
    cursor.execute(query)
    rows = cursor.fetchall()
    # put into dataframe
    df = pd.DataFrame(rows, columns=['cob', 'value'])
    return df

def run_model(df):
    """Detect anomalies in a ``cob``/``value`` time series.

    Args:
        df: dataframe whose ``cob`` column holds values accepted by
            ``pd.to_datetime`` and whose ``value`` column holds the
            observations. Assumes ``cob`` parses to timezone-naive
            timestamps -- TODO confirm against the Presto column type.

    Returns:
        A ``(df, results)`` tuple: ``df`` is a new dataframe sorted by
        ``cob`` with ``cob`` as datetimes truncated to whole seconds, and
        ``results`` is whatever ``detect_ts`` returned for that series.
        The caller's dataframe is not modified.
    """
    # convert cob string to datetime; assign() builds a new frame so the
    # caller's dataframe is left untouched
    df = df.assign(cob=pd.to_datetime(df['cob'])).sort_values('cob')
    # then to unix timestamp integer (whole seconds); subtracting the epoch
    # works whatever resolution pandas stores the datetimes in, unlike
    # dividing the raw int64 view by 10**9
    epoch_seconds = (df['cob'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    # run the model on its own frame so nothing it does to its input can
    # affect the frame we return
    results = detect_ts(
        df.assign(cob=epoch_seconds),
        max_anoms=0.005,
        alpha=0.001,
        direction='both',
    )
    # format the unix timestamp integer back to datetime; unit='s' is
    # required because to_datetime reads bare integers as nanoseconds
    df = df.assign(cob=pd.to_datetime(epoch_seconds, unit='s'))
    return df, results

def plot_graphs(df, results, destination_filepath):
    """Plot the series with its anomalies highlighted and save the figure.

    Args:
        df: dataframe with ``cob`` (x axis) and ``value`` (y axis) columns.
        results: mapping whose ``'anoms'`` entry is a dataframe with an
            ``anoms`` column of anomalous values; its index entries must
            support ``strftime`` because they are used as point labels.
        destination_filepath: path the rendered figure is written to.
    """
    anomalies = results['anoms']

    # plot the graphs: full series as a blue line, anomalies as red dots
    plt.title('Detected Anomalies')
    plt.plot(df['cob'], df['value'], 'b')
    plt.plot(anomalies.index, anomalies['anoms'], 'ro')

    # label every anomaly with its date, anchored at the point itself
    for timestamp, value in zip(anomalies.index, anomalies['anoms']):
        plt.annotate(timestamp.strftime('%d-%b-%Y'), (timestamp, value))

    # save to filepath
    plt.savefig(destination_filepath)

if __name__ == '__main__':

    # must select exactly the two columns extract_from_presto labels
    query = """
    SELECT cob, value
    FROM tablename
    """

    destination_filepath = 'graph_name.png'

    # extract data
    df = extract_from_presto(query)
    # run model and return results
    df, results = run_model(df)
    # plot graph
    plot_graphs(df, results, destination_filepath)