Source code for magpysv.tools

# -*- coding: utf-8 -*-
#    Copyright (C) 2016  Grace Cox (University of Liverpool)
#
#    Released under the MIT license, a copy of which is located at the root of
#    this project.
"""Module containing functions to parse files output by magnetic field models.

Part of the MagPySV package for geomagnetic data analysis. This module provides
various functions to read SV files output by geomagnetic field models.
"""


import os

import numpy as np
import pandas as pd


def calculate_sv(obs_data, mean_spacing=1):
    """Calculate the secular variation from the observed magnetic field values.

    Uses monthly means of geomagnetic observatory data to calculate the SV
    according to user-specified sampling. The typical choices are monthly
    differences of monthly means and annual differences of monthly means. For
    samplings other than monthly differences, the datetime objects of the
    calculated SV are taken as the midpoint of the datetime objects of the
    field data. E.g. differencing the means of the field in January 1999 and
    in January 2000 yields the SV for July 1999.

    Args:
        obs_data (pandas.DataFrame): dataframe containing means (usually
            monthly) of observed geomagnetic field values.
        mean_spacing (int): the number of months separating the monthly mean
            values used to calculate the SV. Set to 1 (default) to use
            adjacent months of data (first differences of monthly means) or
            set to 12 to calculate annual differences of monthly means.

    Returns:
        obs_sv (pandas.DataFrame): dataframe containing SV data.
    """
    obs_sv = pd.DataFrame()
    # Set the SV month to halfway between the MF measurement dates
    obs_sv['date'] = obs_data['date'] - pd.tseries.offsets.DateOffset(
        months=int(np.floor(mean_spacing / 2)), day=1)
    if mean_spacing % 2 == 1:
        obs_sv['date'] = obs_sv['date'].apply(lambda dte: dte.replace(day=1))
    else:
        obs_sv['date'] = obs_sv['date'].apply(lambda dte: dte.replace(day=15))
    # Calculate scale required to give SV in nT/yr
    scaling_factor = 12 / mean_spacing
    obs_sv['dx'] = scaling_factor * obs_data['X'].diff(periods=mean_spacing)
    obs_sv['dy'] = scaling_factor * obs_data['Y'].diff(periods=mean_spacing)
    obs_sv['dz'] = scaling_factor * obs_data['Z'].diff(periods=mean_spacing)
    # The first mean_spacing rows have no difference defined, so drop them
    obs_sv.drop(obs_sv.head(mean_spacing).index, inplace=True)
    return obs_sv
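
# A minimal usage sketch (hypothetical values; assumes a dataframe with the
# 'date', 'X', 'Y' and 'Z' columns described in the docstring above). With a
# synthetic field increasing by 1 nT/month, the annual differences give an
# SV of 12 nT/yr in each component:
#
#     field = pd.DataFrame({'date': pd.date_range('1999-01-01', periods=36,
#                                                 freq='MS'),
#                           'X': np.linspace(17000.0, 17035.0, 36),
#                           'Y': np.linspace(500.0, 535.0, 36),
#                           'Z': np.linspace(47000.0, 47035.0, 36)})
#     sv = calculate_sv(field, mean_spacing=12)  # annual differences, nT/yr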


def calculate_residuals(*, obs_data, model_data):
    """Calculate SV residuals (observed - prediction) using datetime objects.

    Args:
        obs_data (pandas.DataFrame): dataframe containing means (usually
            monthly) of SV calculated from observed geomagnetic field values.
        model_data (pandas.DataFrame): dataframe containing the SV predicted
            by a geomagnetic field model.

    Returns:
        residuals (pandas.DataFrame): dataframe containing SV residuals.
    """
    dates = obs_data['date']
    # Retain only model predictions for dates that occur in the observed data
    model_data = model_data[model_data['date'].isin(dates)]
    obs_data.drop(obs_data.columns[[0]], axis=1, inplace=True)
    model_data.drop(model_data.columns[[0]], axis=1, inplace=True)
    # Calculate SV residuals (data - model prediction) for all observatories
    residuals = pd.DataFrame(
        obs_data.values - model_data.values,
        columns=obs_data.columns)
    obs_data.insert(0, 'date', dates)
    return residuals
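
# A usage sketch (hypothetical values). Note that both dataframes are
# assumed to have 'date' as their first column, since the function drops the
# first column by position:
#
#     obs_sv = pd.DataFrame({'date': pd.to_datetime(['2000-01-15',
#                                                    '2000-02-15']),
#                            'dx': [20.1, 19.8], 'dy': [5.3, 5.6],
#                            'dz': [-30.2, -29.7]})
#     model_sv = pd.DataFrame({'date': obs_sv['date'], 'dx': [20.0, 20.0],
#                              'dy': [5.5, 5.5], 'dz': [-30.0, -30.0]})
#     res = calculate_residuals(obs_data=obs_sv, model_data=model_sv)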


def data_resampling(data, sampling='MS', average_date=True):
    """Resample the hourly geomagnetic data to a specified frequency.

    Args:
        data (pandas.DataFrame): dataframe containing datetime objects and
            hourly means of magnetic data.
        sampling (str): new sampling frequency. Default value is 'MS'
            (monthly means), which averages data for each month and sets the
            datetime object to the first day of that month. Use 'M' to set
            the datetime object to the final day of the month. Another useful
            option is 'A' (annual means), which averages data for a whole
            year and sets the datetime object to the final day of the year.
            Use 'AS' to set the datetime object to the first day of the year.
        average_date (bool): the specified resampling intervals only have
            options for setting the date to the first ('MS' and 'AS') or
            final ('M' and 'A') day of the month or year. For monthly
            averages, a more appropriate representative date is the middle of
            that month (i.e. the 15th day of the month). For annual averages,
            an appropriate representative date is the middle of that year
            (taken as July 1st of the year). This option is used to set the
            dates to the centre of the sampling interval. Defaults to True.

    Returns:
        data (pandas.DataFrame): dataframe of datetime objects and
            monthly/annual means of observatory data.
    """
    # The dates become the index for resampling and are restored as a column
    # by reset_index
    resampled = data.set_index('date').resample(sampling).mean()
    resampled.reset_index(inplace=True)
    # Move the representative date to the centre of the sampling interval
    if average_date is True and 'A' in sampling:
        resampled['date'] = resampled['date'] + pd.tseries.offsets.DateOffset(
            month=7, day=1)
    if average_date is True and 'M' in sampling:
        resampled['date'] = resampled['date'] + pd.tseries.offsets.DateOffset(
            day=15)
    return resampled
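
# A usage sketch (hypothetical values):
#
#     rng = np.random.default_rng(0)
#     n = 24 * 90  # ninety days of hourly values
#     hourly = pd.DataFrame({'date': pd.date_range('2000-01-01', periods=n,
#                                                  freq='H'),
#                            'X': rng.normal(17000.0, 5.0, n),
#                            'Y': rng.normal(500.0, 5.0, n),
#                            'Z': rng.normal(47000.0, 5.0, n)})
#     monthly = data_resampling(hourly)  # monthly means stamped on the 15th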


def apply_Ap_threshold(*, Ap_file=None, obs_data, threshold):
    """Remove observatory data for times with Ap values above a threshold.

    Args:
        Ap_file (str): path to file containing hourly values for the Ap
            index.
        obs_data (pandas.DataFrame): dataframe containing hourly means of
            observed geomagnetic field values.
        threshold (int): the threshold Ap value. Data for times with a higher
            Ap value will be replaced with NaNs and omitted from monthly (or
            annual etc.) means.

    Returns:
        obs_data (pandas.DataFrame): data with the Ap threshold applied.
    """
    col_names = ['date', 'Ap']
    Ap_hourly = pd.read_csv(Ap_file, names=col_names,
                            dtype={'date': 'str', 'Ap': 'float'},
                            parse_dates=[0], skiprows=0, index_col=None)
    # Merge the two dataframes so that only dates contained within both are
    # retained
    obs_data = pd.merge(obs_data, Ap_hourly, on='date', how='inner')
    # Use the threshold to discard data from times of high external field
    # activity
    obs_data.loc[obs_data.Ap > threshold, 'X'] = np.nan
    obs_data.loc[obs_data.Ap > threshold, 'Y'] = np.nan
    obs_data.loc[obs_data.Ap > threshold, 'Z'] = np.nan
    obs_data.drop(['Ap'], axis=1, inplace=True)
    return obs_data
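
# A usage sketch ('ap_hourly.csv' is a hypothetical two-column file of
# datetimes and Ap values in the format read above; 'hourly' is the
# dataframe from the data_resampling sketch):
#
#     quiet = apply_Ap_threshold(Ap_file='ap_hourly.csv', obs_data=hourly,
#                                threshold=7)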


def remove_selected_points(*, data, fname):
    """Remove specified points from the dataset using a list in a file.

    Reads a list of unwanted points from a file and removes them from the
    dataframe if present. E.g. if the user had monthly SV means and wished to
    exclude the X value at NGK in January 2015 from the analysis, the
    following line would be written in the file:

        2015-01-01,ngk,X

    It is preferable (and more repeatable) to use the included outlier
    detection algorithms to remove outliers.

    Args:
        data (pandas.DataFrame): dataframe containing datetime objects and
            daily means of magnetic data.
        fname (str): path to file containing a list of unwanted data points.

    Returns:
        data (pandas.DataFrame): the same dataframe with the points removed.
    """
    col_names = ['date', 'observatory', 'component']
    points = pd.read_csv(fname, sep=',', header=0, names=col_names,
                         parse_dates=[0])
    for data_point in points.itertuples():
        # Column names take the form component_observatory, e.g. 'X_ngk'
        col_name = data_point.component + '_' + data_point.observatory
        try:
            data.loc[data['date'] == data_point.date, col_name] = np.nan
        except KeyError:
            pass
    return data
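
# A usage sketch ('bad_points.csv' is a hypothetical file with a header row
# followed by lines such as '2015-01-01,ngk,X'; 'sv_data' is a hypothetical
# dataframe whose columns are named like 'X_ngk', as in the comment above):
#
#     cleaned = remove_selected_points(data=sv_data, fname='bad_points.csv')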


def calculate_sv_index(obs_data, mean_spacing=1):
    """Calculate the secular variation of a geomagnetic index.

    Uses monthly means of a geomagnetic index (e.g. Dst) to calculate the SV
    according to user-specified sampling. The typical choices are monthly
    differences of monthly means and annual differences of monthly means. For
    samplings other than monthly differences, the datetime objects of the
    calculated SV are taken as the midpoint of the datetime objects of the
    index data. E.g. differencing the means of the index in January 1999 and
    in January 2000 yields the SV for July 1999.

    Args:
        obs_data (pandas.DataFrame): dataframe containing means (usually
            monthly) of a geomagnetic index.
        mean_spacing (int): the number of months separating the monthly mean
            values used to calculate the SV. Set to 1 (default) to use
            adjacent months of data (first differences of monthly means) or
            set to 12 to calculate annual differences of monthly means.

    Returns:
        obs_sv (pandas.DataFrame): dataframe containing SV data.
    """
    obs_sv = pd.DataFrame()
    # Set the SV month to halfway between the measurement dates
    obs_sv['date'] = obs_data['date'] - pd.tseries.offsets.DateOffset(
        months=int(np.floor(mean_spacing / 2)), day=1)
    if mean_spacing % 2 == 1:
        obs_sv['date'] = obs_sv['date'].apply(lambda x: x.replace(day=1))
    else:
        obs_sv['date'] = obs_sv['date'].apply(lambda x: x.replace(day=15))
    # Calculate scale required to give SV in nT/yr
    scaling_factor = 12 / mean_spacing
    obs_sv['mean'] = scaling_factor * obs_data['mean'].diff(
        periods=mean_spacing)
    # The first mean_spacing rows have no difference defined, so drop them
    obs_sv.drop(obs_sv.head(mean_spacing).index, inplace=True)
    return obs_sv
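
# A usage sketch (hypothetical values; assumes a dataframe with 'date' and
# 'mean' columns, e.g. monthly means of the Dst index):
#
#     dst = pd.DataFrame({'date': pd.date_range('1999-01-01', periods=36,
#                                               freq='MS'),
#                         'mean': np.linspace(-20.0, -55.0, 36)})
#     dst_sv = calculate_sv_index(dst, mean_spacing=12)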


def calculate_correlation_index(*, dates, signal, index_file):
    """Calculate the correlation between a signal and a geomagnetic index.

    Args:
        dates (datetime.datetime): dates of time series measurements.
        signal (time series): data to be compared with geomagnetic index.
        index_file (str): path to file containing geomagnetic index.

    Returns:
        (tuple): tuple containing:

        - coeff (*float*): correlation coefficient.
        - merged (*pandas.DataFrame*): dataframe containing dates, signal and
          the geomagnetic index.
    """
    col_names = ['date', 'index_vals']
    index_df = pd.read_csv(index_file, names=col_names,
                           dtype={'date': 'str', 'index_vals': 'float'},
                           parse_dates=[0], skiprows=1, index_col=None)
    df = pd.DataFrame({'date': dates, 'proxy': signal})
    # Merge the two dataframes so that only dates contained within both are
    # retained
    merged = pd.merge(df.dropna(), index_df.dropna(), on='date', how='inner')
    # np.corrcoef returns a 2x2 correlation matrix; the off-diagonal element
    # is the correlation between the two series
    coeff = np.corrcoef(merged.index_vals, merged.proxy)
    return np.abs(coeff[0, 1]), merged
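
# A usage sketch ('dst_monthly.csv' is a hypothetical index file with a
# header row followed by date,value lines; 'sv' is the dataframe from the
# calculate_sv sketch):
#
#     coeff, merged = calculate_correlation_index(
#         dates=sv['date'], signal=sv['dx'], index_file='dst_monthly.csv')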


def get_baseline_info(*, fname=None):
    """Read documented baseline changes from a file.

    Args:
        fname (str): location of file containing documented baseline changes.

    Returns:
        data (pandas.DataFrame): baseline change data.
    """
    if fname is None:
        fname = os.path.join(os.path.dirname(__file__), 'baseline_records')
    col_names = ['observatory', 'jump_year', 'x_jump', 'y_jump', 'z_jump']
    data = pd.read_csv(fname, sep=',', names=col_names)
    data['jump_year'] = pd.to_datetime(
        dict(year=data['jump_year'], month=1, day=1))
    return data
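
# A usage sketch; with no fname given, the baseline_records file distributed
# with the package is read:
#
#     baselines = get_baseline_info()
#     print(baselines.head())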


def correct_baseline_change(*, observatory, field_data, baseline_data,
                            print_data):
    """Correct documented baseline changes.

    The corrections are applied to field_data in place.

    Args:
        observatory (str): observatory name given as a 3-letter IAGA code.
        field_data (pandas.DataFrame): uncorrected magnetic field data.
        baseline_data (pandas.DataFrame): baseline discontinuity data in the
            format output by get_baseline_info.
        print_data (bool): option to print the corrections made.
    """
    # Extract baseline data for the specified observatory
    obs_jumps = baseline_data.loc[
        baseline_data['observatory'] == observatory.upper()]
    if print_data is True:
        print(obs_jumps)
    # Loop over the baseline changes and make the corrections
    for jump in obs_jumps.index:
        # Check if a change of zero or 99999 is recorded (both values are
        # used for unknown baseline changes)
        if (obs_jumps.loc[jump].x_jump == 0
                and obs_jumps.loc[jump].y_jump == 0
                and obs_jumps.loc[jump].z_jump == 0):
            print("Baseline change of unknown magnitude: ",
                  obs_jumps.loc[jump].jump_year)
        elif (obs_jumps.loc[jump].x_jump == 99999
                and obs_jumps.loc[jump].y_jump == 99999
                and obs_jumps.loc[jump].z_jump == 99999):
            print("Baseline change of unknown magnitude: ",
                  obs_jumps.loc[jump].jump_year)
        else:
            # Shift all data recorded before the jump date by the documented
            # amounts
            before_jump = field_data['date'] < obs_jumps.loc[jump].jump_year
            field_data.loc[before_jump, 'X'] = field_data.loc[
                before_jump, 'X'] - obs_jumps.loc[jump].x_jump
            field_data.loc[before_jump, 'Y'] = field_data.loc[
                before_jump, 'Y'] - obs_jumps.loc[jump].y_jump
            field_data.loc[before_jump, 'Z'] = field_data.loc[
                before_jump, 'Z'] - obs_jumps.loc[jump].z_jump
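
# A usage sketch ('field' and 'baselines' are the dataframes from the
# calculate_sv and get_baseline_info sketches; the correction modifies
# field_data in place rather than returning a new dataframe):
#
#     correct_baseline_change(observatory='ngk', field_data=field,
#                             baseline_data=baselines, print_data=True)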