# Source code for magpysv.io

# -*- coding: utf-8 -*-
#    Copyright (C) 2016  Grace Cox (University of Liverpool)
#
#    Released under the MIT license, a copy of which is located at the root of
#    this project.
"""Module containing functions to parse World Data Centre (WDC) files.

Part of the MagPySV package for geomagnetic data analysis. This module provides
various functions to read, parse and manipulate the contents of World Data
Centre (WDC) formatted files containing geomagnetic data and output data to
comma separated values (CSV) files. Also contains functions to read output of
code used for the COV-OBS magnetic field model series by Gillet et al. (links
below).
"""


import datetime as dt
import glob
import os
import pandas as pd
import numpy as np


def wdc_parsefile(fname):
    """Load a WDC datafile and place the contents into a dataframe.

    Load a datafile of WDC hourly geomagnetic data for a single observatory and
    extract the contents. Parses the current WDC file format, but not the
    previous format containing international quiet (Q) or disturbed (D) day
    designation in place of the century field - only the newer format is
    downloaded from the BGS servers. Detailed file format description
    can be found at http://www.wdc.bgs.ac.uk/catalog/format.html

    Args:
        fname (str): path to a WDC datafile.

    Returns:
        data (pandas.DataFrame):
            long-format dataframe with one row per hourly value. Columns are
            'code', 'yr', 'month', 'component', 'day', 'century', 'base',
            'hour' (zero-based position of the value within the daily record)
            and 'hourly_mean_temp' (the raw hourly value converted to float).
            Datetime objects and X, Y, Z components are NOT built here; that
            is done afterwards by wdc_datetimes and wdc_xyz.
    """
    # New WDC file format: character ranges of each fixed-width field
    cols = [(0, 3), (3, 5), (5, 7), (7, 8), (8, 10), (14, 16),
            (16, 20), (20, 116)]
    col_names = [
        'code', 'yr', 'month', 'component', 'day', 'century',
        'base', 'hourly_values']
    # Converter keys must match col_names; the two-digit year column is 'yr'
    types = {
        'code': str, 'yr': int, 'month': int, 'component': str,
        'day': int, 'century': int, 'base': int, 'hourly_values': str}
    data = pd.read_fwf(fname, colspecs=cols, names=col_names,
                       converters=types, header=None)
    # Split the block of hourly values into a list of 4-character strings
    data['hourly_values'] = data['hourly_values'].apply(
        separate_hourly_vals)
    # Every column apart from the hourly values identifies the daily record
    id_cols = col_names[:-1]
    # Expand each list into columns 0..n-1 and stack them so that every hourly
    # value ends up on its own row, labelled by its position within the day
    data = data.set_index(id_cols)['hourly_values'].apply(pd.Series).stack()
    data = data.reset_index()
    data.columns = id_cols + ['hour', 'hourly_mean_temp']
    data['hourly_mean_temp'] = data['hourly_mean_temp'].astype(float)

    return data


def separate_hourly_vals(hourstring):
    """Split the string of hourly values from one WDC record into fields.

    The string is cut into consecutive four-character pieces, starting from
    the first character. If the length is not a multiple of four the final
    piece is shorter. Called by wdc_parsefile and ae_parsefile.

    Args:
        hourstring (str): string containing the hourly magnetic field means
            parsed from a WDC file for a single day.

    Returns:
        hourly_vals_list (list):
            list of strings, one per hourly field value.
    """
    width = 4
    hourly_vals_list = []
    for start in range(0, len(hourstring), width):
        hourly_vals_list.append(hourstring[start:start + width])
    return hourly_vals_list


def wdc_datetimes(data):
    """Build a datetime column from the date fields of parsed WDC data.

    The dataframe is modified in place: a 'date' column is inserted at the
    front and the 'year', 'yr', 'century', 'code', 'day', 'month' and 'hour'
    columns are removed.

    Args:
        data (pandas.DataFrame): needs columns for century, year (yy format,
            column 'yr'), month, day, hour and observatory code.

    Returns:
        data (pandas.DataFrame):
            the same dataframe with a series of datetime objects (in the format
            yyyy-mm-dd hh:30:00) in the first column.
    """
    def _row_to_datetime(row):
        # The hourly mean is assigned to half past the hour
        stamp = "{0} {1} {2} {3} {4}".format(
            row['year'], row['month'], row['day'], row['hour'], 30)
        return dt.datetime.strptime(stamp, "%Y %m %d %H %M")

    # Four-digit year from the century and two-digit year fields
    data['year'] = 100 * data['century'] + data['yr']

    timestamps = data.apply(_row_to_datetime, axis=1)
    data.insert(0, 'date', timestamps)

    # The individual date fields are no longer needed once 'date' exists
    redundant = ['year', 'yr', 'century', 'code', 'day', 'month', 'hour']
    data.drop(redundant, axis=1, inplace=True)

    return data


def wdc_xyz(data):
    """Convert extracted WDC data to hourly X, Y and Z components in nT.

    Missing values (indicated by 9999 in the datafiles) are replaced with NaNs.
    Note that the replacement is done in place on the dataframe passed in.

    Args:
        data (pandas.DataFrame): dataframe containing columns for datetime
            objects ('date'), magnetic field component ('component': D, I, F,
            H, X, Y or Z), the tabular base ('base') and the raw hourly value
            ('hourly_mean_temp').

    Returns:
        data (pandas.DataFrame):
            dataframe with datetime objects in the first column ('date') and
            columns for the X, Y and Z components of the magnetic field (in
            nT). A component that cannot be obtained from the input is filled
            with NaN.
    """
    # Replace missing values with NaNs
    data.replace(9999, np.nan, inplace=True)
    # Group the data by field component, calculate the hourly means and form
    # a dataframe with separate columns for each field component
    data = data.groupby('component').apply(hourly_mean_conversion)
    data.reset_index(drop=True, inplace=True)
    data.drop(['base', 'hourly_mean_temp'], axis=1, inplace=True)
    # In older versions pd.pivot_table() kept NaNs by default, but we
    # test for the NaN being present so must force them to be kept.
    data = data.pivot_table(index='date', columns='component',
                            values='hourly_mean', dropna=False)
    data.reset_index(inplace=True)

    # Call helper function to convert D and H components to X and Y
    if 'D' in data.columns and 'H' in data.columns:
        data = angles_to_geographic(data)

    # Make sure that the dataframe contains columns for X, Y and Z components,
    # and create a column of NaN values if a component is missing.
    # (np.nan is used because the np.NaN alias was removed in NumPy 2.0.)
    for component in ('X', 'Y', 'Z'):
        if component not in data.columns:
            data[component] = np.nan

    data = data[['date', 'X', 'Y', 'Z']]

    return data


def hourly_mean_conversion(data):
    """Use the tabular base to calculate hourly means in nT or degrees (D, I).

    Uses the tabular base and hourly value from the WDC file to calculate the
    hourly means of magnetic field components. Value is in nT for H, F, X, Y or
    Z components and in degrees for D or I components. Called by wdc_xyz.

    hourly_mean = tabular_base*100 + wdc_hourly_value (for components in nT)

    hourly_mean = tabular_base + wdc_hourly_value/600 (for D and I components)

    Args:
        data (pandas.DataFrame): dataframe containing columns 'component'
            (D, I, F, H, X, Y or Z), 'base' (the tabular base) and
            'hourly_mean_temp' (the raw hourly value). Any other columns are
            carried through unchanged. The input is not modified.

    Returns:
        obs_data (pandas.DataFrame):
            copy of the input rows, ordered by component and re-indexed from
            zero, with an extra 'hourly_mean' column holding the hourly means
            in either nT or degrees (depending on the component). An empty
            dataframe is returned if the input contains no rows.
    """
    converted = []
    for component, group in data.groupby('component'):
        # Work on a copy: assigning into a groupby slice triggers pandas'
        # chained-assignment warning and may not behave consistently
        group = group.copy()
        if component in ('D', 'I'):
            # Angular components: base in degrees, value in tenths of minutes
            group['hourly_mean'] = group['base'] + \
                (1 / 600.0) * group['hourly_mean_temp']
        else:
            # Intensity components: base in hundreds of nT, value in nT
            group['hourly_mean'] = 100.0 * group['base'] + \
                group['hourly_mean_temp']
        converted.append(group)

    # pd.concat cannot handle an empty list; mirror the original behaviour of
    # returning an empty dataframe when there is nothing to convert
    if not converted:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0, so concatenate once instead
    obs_data = pd.concat(converted, ignore_index=True)
    return obs_data


def angles_to_geographic(data):
    """Use D and H values to calculate the X and Y field components.

    The declination (D, in degrees) and horizontal intensity (H) give the
    north (X) and east (Y) components as follows:

    X = H*cos(D)

    Y = H*sin(D)

    Only rows in which both D and H are present (not NaN) are filled in; the
    X and Y entries of all other rows are left as they were. The dataframe is
    modified in place.

    Args:
        data (pandas.DataFrame): dataframe containing columns for datetime
            objects and hourly means of the magnetic field components (D, I, F,
            H, X, Y or Z). Columns 'D' and 'H' are required.

    Returns:
        data (pandas.DataFrame):
            the same dataframe with 'X' and 'Y' columns set from D and H
            wherever both are available.
    """
    # Rows for which neither the declination nor the horizontal intensity is
    # missing
    usable = ~(np.isnan(data['D']) | np.isnan(data['H']))

    horizontal = data.loc[usable, 'H']
    declination = np.deg2rad(data.loc[usable, 'D'])

    data.loc[usable, 'X'] = horizontal * np.cos(declination)
    data.loc[usable, 'Y'] = horizontal * np.sin(declination)

    return data


def wdc_readfile(fname):
    """Read a WDC file and return hourly X, Y and Z values.

    Convenience wrapper that chains wdc_parsefile, wdc_datetimes and wdc_xyz.

    Args:
        fname (str): path to a WDC datafile.

    Returns:
        data (pandas.DataFrame):
            dataframe containing the data read from the WDC file. First column
            is a series of datetime objects (in the format yyyy-mm-dd hh:30:00)
            and subsequent columns are the X, Y and Z components of the
            magnetic field at the specified times (hourly means).
    """
    parsed = wdc_parsefile(fname)
    dated = wdc_datetimes(parsed)
    return wdc_xyz(dated)


def append_wdc_data(*, obs_name, path=None):
    """Append all WDC data for an observatory into a single dataframe.

    Files are matched with the pattern '<obs_name in lowercase>*.wdc' inside
    the given directory and read in sorted filename order. The name of each
    file is printed as it is read.

    Args:
        obs_name (str): observatory name (as 3-digit IAGA code).
        path (str): path to directory containing WDC datafiles. All files for
            the observatory should be located in the same directory. If None
            (the default), the current working directory is searched.

    Returns:
        data (pandas.DataFrame):
            dataframe containing all available hourly geomagnetic data at a
            single observatory. First column is a series of datetime objects
            (in the format yyyy-mm-dd hh:30:00) and subsequent columns are the
            X, Y and Z components of the magnetic field at the specified times.
            An empty dataframe is returned if no files are found.
    """
    # os.path.join(None, ...) raises TypeError, so the default of None is
    # mapped onto the current directory
    search_dir = os.curdir if path is None else path
    data_path = os.path.join(search_dir, obs_name.lower() + '*.wdc')
    # Obtain a list of all files containing the observatory name and ending
    # .wdc in the specified directory
    filenames = sorted(glob.glob(data_path))

    frames = []
    for fname in filenames:
        print(fname)
        try:
            frames.append(wdc_readfile(fname))
        except StopIteration:
            # Best-effort skip kept from the original implementation
            pass

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated data for every file
    return pd.concat(frames, ignore_index=True)


def covobs_parsefile(*, fname, data_type):
    """Loads MF and SV predictions from the COV-OBS geomagnetic field model.

    Load a datafile containing SV/MF predictions from the COV-OBS magnetic
    field model series by Gillet et al. (2013, Geochem. Geophys. Geosyst.,
    https://doi.org/10.1002/ggge.20041;
    2015, Earth, Planets and Space, https://doi.org/10.1186/s40623-015-0225-z)
    field model. Only the first four whitespace-separated columns of the file
    are read.

    Args:
        fname (str): path to a COV-OBS datafile.
        data_type (str): specify whether the file contains magnetic field data
            ('mf') or secular variation data ('sv'). Any value other than
            'mf' is treated as secular variation.

    Returns:
        model_data (pandas.DataFrame):
            dataframe whose first column ('year_decimal') holds the time as
            written in the file and whose remaining columns are the X, Y and Z
            components of the field ('X', 'Y', 'Z') or of the secular
            variation ('dX', 'dY', 'dZ').
    """
    model_data = pd.read_csv(fname, sep=r'\s+', header=None,
                             usecols=[0, 1, 2, 3])
    # Compare by value: 'is' tests object identity and can be False for an
    # equal string that was built at runtime
    if data_type == 'mf':
        model_data.columns = ["year_decimal", "X", "Y", "Z"]
    else:
        model_data.columns = ["year_decimal", "dX", "dY", "dZ"]
    return model_data


def covobs_datetimes(data):
    """Create datetime objects from COV-OBS field model output file.

    The decimal year is split into its integer part (the year) and its
    fractional part, and the month is taken as round(12 * fraction + 1), so
    e.g. 1960.0 becomes 1960-01-01 and 1960.5 becomes 1960-07-01. A fraction
    large enough to round to month 13 makes the date parsing raise a
    ValueError. The dataframe is modified in place.

    Args:
        data (pandas.DataFrame): needs a 'year_decimal' column holding the
            time as a decimal year.

    Returns:
        data (pandas.DataFrame):
            the same dataframe with the decimal year column replaced with a
            series of datetime objects in the format yyyy-mm-dd ('date', first
            column; the day is always the first of the month).
    """
    def _first_of_month(row):
        # Rows may arrive as floats, hence the explicit integer conversion
        label = "{0} {1}".format(int(row['year']), int(row['month']))
        return dt.datetime.strptime(label, "%Y %m")

    decimal_years = data.year_decimal.values.astype('float64')
    whole_years = np.floor(decimal_years).astype('int')

    fraction_of_year = data.year_decimal - whole_years
    month_numbers = (12 * fraction_of_year + 1).round().astype('int')

    data.insert(0, 'year', whole_years)
    data.insert(1, 'month', month_numbers)

    timestamps = data.apply(_first_of_month, axis=1)
    data.insert(0, 'date', timestamps)

    # Only the datetime column is kept; drop the helper columns
    data.drop(['year', 'year_decimal', 'month'], axis=1, inplace=True)

    return data


def covobs_readfile(*, fname, data_type):
    """Read a COV-OBS format file into a dataframe with a datetime column.

    Convenience wrapper that chains covobs_parsefile and covobs_datetimes.

    The COV-OBS code (publicly available) can be used to produce synthetic
    observatory time series for other field models if the appropriate spline
    file is used. The output will be of the same format as COV-OBS output and
    can be read using MagPySV.

    Args:
        fname (str): path to a COV-OBS format datafile.
        data_type (str): specify whether the file contains magnetic field data
            ('mf') or secular variation data ('sv')

    Returns:
        data (pandas.DataFrame):
            dataframe containing the data read from the file. First column is a
            series of datetime objects (in the format yyyy-mm-dd) and
            subsequent columns are the X, Y and Z components of the SV/MF at
            the specified times.
    """
    parsed = covobs_parsefile(fname=fname, data_type=data_type)
    return covobs_datetimes(parsed)


def wdc_to_hourly_csv(*, wdc_path=None, write_dir, obs_list,
                      print_obs=True):
    """Convert WDC files to hourly X, Y and Z means and save them as CSV.

    For every observatory in obs_list, all of its WDC hourly data files in
    wdc_path are read with append_wdc_data and the combined dataframe is
    written to a CSV file with write_csv_data. Data for all observatories are
    assumed to be located inside the same directory. The BGS downloading app
    distributes data inside a single directory with the naming convention
    obsyear.wdc where obs is a three digit observatory name in lowercase and
    year is a four digit year, e.g. ngk1990.wdc or clf2013.wdc.

    Args:
        wdc_path (str): path to the directory containing datafiles.
        write_dir (str): directory path to which the output CSV files are
            written. Created if it does not exist.
        obs_list (list): list of observatory names (as 3-digit IAGA codes).
        print_obs (bool): choose whether to print each observatory name as the
            function goes through the directories. Useful for checking progress
            as it can take a while to read the whole WDC dataset. Defaults to
            True.
    """
    # Make sure there is somewhere to write the output
    output_dir_missing = not os.path.exists(write_dir)
    if output_dir_missing:
        os.makedirs(write_dir)

    # Names are only echoed when the flag is exactly True
    announce = print_obs is True

    for obs_code in obs_list:
        if announce:
            print(obs_code)
        hourly_frame = append_wdc_data(obs_name=obs_code, path=wdc_path)
        write_csv_data(data=hourly_frame, write_dir=write_dir,
                       obs_name=obs_code)

def write_csv_data(*, data, write_dir, obs_name, file_prefix=None,
                   decimal_dates=False, header=True):
    """Write dataframe to a CSV file.

    The file is named '<file_prefix><obs_name>.csv' (or '<obs_name>.csv' when
    no prefix is given) and is written without the index, using 'NA' for
    missing values.

    Args:
        data (pandas.DataFrame): data to be written to file.
        write_dir (str): directory path to which the output CSV file is
            written. Created if it does not exist.
        obs_name (str): name of observatory at which the data were obtained.
        file_prefix (str): optional string to prefix the output CSV filenames
            (useful for specifying parameters used to create the dataset etc).
        decimal_dates (bool): optional argument to specify that dates should be
            written in decimal format rather than datetime objects. When True,
            the 'date' column of the dataframe passed in is converted in
            place. Defaults to False.
        header (bool): option to include header in file. Defaults to True.
    """
    # Make sure there is somewhere to write the output
    output_dir_missing = not os.path.exists(write_dir)
    if output_dir_missing:
        os.makedirs(write_dir)

    # Swap datetime objects for decimal years on request
    if decimal_dates is True:
        data.date = data.date.apply(datetime_to_decimal)

    prefix = file_prefix if file_prefix is not None else ''
    fpath = os.path.join(write_dir, prefix + obs_name + '.csv')
    data.to_csv(fpath, sep=',', na_rep='NA', header=header, index=False)


def read_csv_data(*, fname, data_type):
    """Read dataframe from a CSV file.

    The first line of the file is treated as a header and its column names are
    replaced according to data_type. The first column is parsed as dates.

    Args:
        fname (str): path to a CSV datafile.
        data_type (str): specify whether the file contains magnetic field data
            ('mf') or secular variation data ('sv'). Any value other than
            'mf' is treated as secular variation.

    Returns:
        data (pandas.DataFrame):
            dataframe containing the data read from the CSV file, with columns
            'date', 'X', 'Y', 'Z' for magnetic field data or 'date', 'dX',
            'dY', 'dZ' otherwise.
    """
    # Compare by value: 'is' tests object identity and can be False for an
    # equal string that was built at runtime
    if data_type == 'mf':
        col_names = ['date', 'X', 'Y', 'Z']
    else:
        col_names = ['date', 'dX', 'dY', 'dZ']
    data = pd.read_csv(fname, sep=',', header=0, names=col_names,
                       parse_dates=[0])
    return data


def combine_csv_data(*, start_date, end_date, sampling_rate='MS',
                     obs_list, data_path, model_path, day_of_month=1):
    """Read and combine observatory and model SV data for several locations.

    Calls read_csv_data to read observatory data and covobs_readfile to read
    field model predictions for each observatory in a list. The data and
    predictions for individual observatories are combined into their
    respective large dataframes. The first column contains datetime objects
    and subsequent columns contain X, Y and Z secular variation/field
    components (in groups of three) for all observatories.

    For an observatory 'obs' the files read are '<data_path>/obs.csv',
    '<model_path>/sv_obs.dat' and '<model_path>/mf_obs.dat'.

    Args:
        start_date (datetime.datetime): the start date of the data analysis.
        end_date (datetime.datetime): the end date of the analysis.
        sampling_rate (str): the sampling rate for the period of interest. The
            default is 'MS', which creates a range of dates between the
            specified values at monthly intervals with the day fixed as the
            first of each month. Use 'M' for the final day of each month. Other
            useful options are 'AS' (a series of dates at annual intervals,
            with the day and month fixed at 01 and January respectively) and
            'A' (as for 'AS' but with the day/month fixed as 31 December.)
        obs_list (list): list of observatory names (as 3-digit IAGA codes).
        data_path (str): path to the CSV files containing observatory data.
        model_path (str): path to the CSV files containing model SV data.
        day_of_month (int): For SV data, first differences of
            monthly means have dates at the start of the month (i.e. MF of
            mid-Feb minus MF of mid-Jan should give SV at Feb 1st. For annual
            differences of monthly means the MF of mid-Jan year 2 minus MF of
            mid-Jan year 1 gives SV at mid-July year 1. The dates of COV-OBS
            output default to the first day of the month (compatible with dates
            of monthly first differences SV data, but not with those of
            annual differences). This option is used to set the day part of the
            dates column if required. Default to 1 (all output dataframes
            will have dates set at the first day of the month.)

    Returns:
        (tuple): tuple containing:

        - obs_data (*pandas.DataFrame*):
            dataframe containing SV data for all observatories in obs_list.
        - model_sv_data (*pandas.DataFrame*):
            dataframe containing SV predictions for all observatories in
            obs_list.
        - model_mf_data (*pandas.DataFrame*):
            dataframe containing magnetic field predictions for all
            observatories in obs_list.
    """
    # Initialise the dataframes with the appropriate date range
    dates = pd.date_range(start_date, end_date, freq=sampling_rate)
    obs_data = pd.DataFrame({'date': dates})
    model_sv_data = pd.DataFrame({'date': dates})
    model_mf_data = pd.DataFrame({'date': dates})

    for observatory in obs_list:

        obs_file = observatory + '.csv'
        model_sv_file = 'sv_' + observatory + '.dat'
        model_mf_file = 'mf_' + observatory + '.dat'
        obs_data_temp = read_csv_data(fname=os.path.join(data_path, obs_file),
                                      data_type='sv')
        model_sv_data_temp = covobs_readfile(
            fname=os.path.join(model_path, model_sv_file), data_type='sv')
        model_mf_data_temp = covobs_readfile(
            fname=os.path.join(model_path, model_mf_file), data_type='mf')

        # Move the observed and model SV dates to the first of the month so
        # that they line up with the date range used for merging
        model_sv_data_temp['date'] = model_sv_data_temp['date'].apply(
            lambda stamp: stamp.replace(day=1))
        obs_data_temp['date'] = obs_data_temp['date'].apply(
            lambda stamp: stamp.replace(day=1))

        # Tag every component column with the observatory name
        suffix = '_' + observatory
        sv_names = {name: name + suffix for name in ('dX', 'dY', 'dZ')}
        mf_names = {name: name + suffix for name in ('X', 'Y', 'Z')}
        obs_data_temp.rename(columns=sv_names, inplace=True)
        model_sv_data_temp.rename(columns=sv_names, inplace=True)
        model_mf_data_temp.rename(columns=mf_names, inplace=True)

        # Combine the current observatory data with those of other
        # observatories (the first observatory is merged in exactly the same
        # way as the rest)
        obs_data = pd.merge(
            left=obs_data, right=obs_data_temp,
            how='left', on='date')
        model_sv_data = pd.merge(
            left=model_sv_data, right=model_sv_data_temp,
            how='left', on='date')
        model_mf_data = pd.merge(
            left=model_mf_data, right=model_mf_data_temp,
            how='left', on='date')

    # Compare by value: identity tests on integer literals are unreliable
    if day_of_month != 1:
        model_sv_data['date'] = model_sv_data['date'].apply(
            lambda stamp: stamp.replace(day=day_of_month))
        model_mf_data['date'] = model_sv_data['date']
        obs_data['date'] = model_sv_data['date']
    return obs_data, model_sv_data, model_mf_data


def datetime_to_decimal(date):
    """Convert a datetime object to a decimal year.

    The fractional part is the time elapsed since the start of the year
    divided by the length of that year, so leap years are accounted for.

    Args:
        date (datetime.datetime): datetime object representing an observation
            time.

    Returns:
        date (float): the same date represented as a decimal year.
    """
    this_year = date.year
    jan_first = dt.datetime(this_year, 1, 1)
    next_jan_first = jan_first.replace(year=this_year + 1)

    elapsed = date - jan_first
    year_length = next_jan_first - jan_first
    return this_year + elapsed / year_length


def ae_parsefile(fname):
    """Load a WDC-like format AE file and place contents into a dataframe.

    Load a file of AE (Auroral Electrojet)
    index hourly data in the format distributed by the Kyoto WDC at
    http://wdc.kugi.kyoto-u.ac.jp/dstae/index.html and extract the contents.
    Only records whose first two characters are "AE" are kept.

    Args:
        fname (str): path to a WDC-like formatted AE file.

    Returns:
        data (pandas.DataFrame):
            long-format dataframe with one row per hourly value. Columns are
            'code', 'yr', 'month', 'day', 'century', 'base', 'hour'
            (zero-based position of the value within the daily record) and
            'hourly_mean_temp' (the raw hourly value converted to float).
            Datetime objects are NOT built here; that is done afterwards by
            wdc_datetimes (see ae_readfile).
    """
    # AE WDC file format: character ranges of each fixed-width field
    cols = [(0, 2), (3, 5), (5, 7), (8, 10), (14, 16),
            (16, 20), (20, 116)]
    col_names = [
        'code', 'yr', 'month', 'day', 'century',
        'base', 'hourly_values']
    # Converter keys must match col_names; the two-digit year column is 'yr'
    types = {
        'code': str, 'yr': int, 'month': int,
        'day': int, 'century': int, 'base': int, 'hourly_values': str}
    data = pd.read_fwf(fname, colspecs=cols, names=col_names,
                       converters=types, header=None)
    # Copy the filtered rows so that the column assignment below acts on an
    # independent dataframe rather than on a slice of the full one
    data = data.loc[data['code'] == "AE"].copy()
    # Separate the hourly values
    try:
        data['hourly_values'] = data['hourly_values'].apply(
            separate_hourly_vals)
    except ValueError:
        # Fallback kept from the original implementation
        data['hourly_values'] = data['hourly_values'].apply(
            separate_hourly_vals_ae)
    # Every column apart from the hourly values identifies the daily record
    id_cols = col_names[:-1]
    # Expand each list into columns 0..n-1 and stack them so that every hourly
    # value ends up on its own row, labelled by its position within the day
    data = data.set_index(id_cols)['hourly_values'].apply(pd.Series).stack()
    data = data.reset_index()
    data.columns = id_cols + ['hour', 'hourly_mean_temp']
    data['hourly_mean_temp'] = data['hourly_mean_temp'].astype(float)
    return data


def separate_hourly_vals_ae(hourstring):
    """Separate individual hourly field means from the string containing all
    24 values in the AE file. Called by ae_parsefile.

    If the string does not begin with a minus sign or a space, a single space
    is prepended before it is cut into consecutive four-character pieces.

    Args:
        hourstring (str): string containing the hourly AE means parsed from
            a Kyoto WDC-like file for a single day.

    Returns:
        hourly_vals_list (list):
            list containing the hourly AE values (as strings). Empty if
            hourstring is empty.
    """
    n = 4
    # Compare characters by value; 'is' tests object identity, which only
    # happens to work for single characters as an implementation detail.
    # The truthiness check avoids an IndexError on an empty string.
    if hourstring and hourstring[0] not in ('-', ' '):
        hourstring = ' ' + hourstring
    hourly_vals_list = [hourstring[i:i + n]
                        for i in range(0, len(hourstring), n)]
    return hourly_vals_list


def ae_readfile(fname):
    """Read an AE file and return hourly AE values with a datetime column.

    Chains ae_parsefile and wdc_datetimes, then combines the tabular base and
    the raw hourly value into the hourly mean
    (hourly_mean = 100 * base + hourly value).

    Args:
        fname (str): path to a AE file in Kyoto WDC-like format. Assumes data
            for all years are contained within this file.

    Returns:
        data (pandas.DataFrame):
            dataframe containing the data read from the WDC file. First column
            is a series of datetime objects (in the format yyyy-mm-dd hh:30:00)
            and second column contains AE values at the specified times
            (hourly means).
    """
    data = wdc_datetimes(ae_parsefile(fname))
    scaled_base = 100.0 * data['base']
    data['hourly_mean'] = scaled_base + data['hourly_mean_temp']
    # The raw inputs are not needed once the hourly mean has been formed
    data.drop(['hourly_mean_temp', 'base'], axis=1, inplace=True)
    return data


def append_ae_data(ae_data_path):
    """Append AE data into a single dataframe containing all years.

    Data downloaded from
    ftp://ftp.ngdc.noaa.gov/STP/GEOMAGNETIC_DATA/INDICES/AURORAL_ELECTROJET/HOURLY/
    come in WDC-like format files with one file per year (data provided by
    the WDC at Kyoto. Can be downloaded directly from
    http://wdc.kugi.kyoto-u.ac.jp/dstae/index.html). This function reads every
    file in the directory whose name matches 'ae*.txt', in sorted filename
    order, printing each filename as it is read.

    Args:
        ae_data_path (str): path to directory containing WDC-like format AE
            datafiles. All AE files should be located in the same directory.
            A trailing path separator is optional.

    Returns:
        data (pandas.DataFrame):
            dataframe containing all available hourly AE data. First column is
            a series of datetime objects (in the format yyyy-mm-dd hh:30:00)
            and second column contains AE values at the specified times.
            An empty dataframe is returned if no files are found.
    """
    # Obtain a list of all files starting with 'ae' and ending in .txt in the
    # specified directory. os.path.join inserts the path separator when the
    # directory is given without a trailing one.
    filenames = sorted(glob.glob(os.path.join(ae_data_path, 'ae*.txt')))

    frames = []
    for fname in filenames:
        print(fname)
        try:
            frames.append(ae_readfile(fname))
        except StopIteration:
            # Best-effort skip kept from the original implementation
            pass

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated data for every file
    return pd.concat(frames, ignore_index=True)


def ap_readfile(fname):
    """Load an kp/ap file and place the hourly ap values into a dataframe.

    Load a datafile of 3-hourly ap data and extract the contents. Each of the
    3-hourly values for a given day is repeated three times to give an hourly
    mean for all 24 hours of the day. This function is designed to read files
    downloaded from the GFZ, Potsdam server at
    ftp://ftp.gfz-potsdam.de/pub/home/obs/kp-ap/.

    The year is taken from the four characters preceding the final four
    characters of fname (e.g. the yyyy of kpyyyy.wdc), and two slightly
    different record layouts are used depending on whether that year starts
    with a '2'.

    Args:
        fname (str): path to an ap datafile.

    Returns:
        data (pandas.DataFrame):
            dataframe containing hourly ap data. First column is a
            series of datetime objects (in the format yyyy-mm-dd hh:30:00) and
            second column contains ap values at the specified times.
    """
    def _row_to_datetime(row):
        # The hourly value is assigned to half past the hour
        stamp = "{0} {1} {2} {3} {4}".format(
            row['year'], row['month'], row['day'], row['hour'], 30)
        return dt.datetime.strptime(stamp, "%Y %m %d %H %M")

    # Layout of a record: first character read from the line, then the
    # positions of the month, the day and the start of the ap values within
    # the string that was read
    if fname[-8] == '2':
        first_char, month_span, day_span, values_start = 1, (1, 3), (3, 5), 30
    else:
        first_char, month_span, day_span, values_start = 0, (2, 4), (4, 6), 32

    # Parse the file: each record is read as one string and sliced afterwards
    data = pd.read_fwf(fname, colspecs=[(first_char, 55)],
                       names=['full_string'],
                       converters={'full_string': str}, header=None)
    data['month'] = data.full_string.str[month_span[0]:month_span[1]]
    data['day'] = data.full_string.str[day_span[0]:day_span[1]]
    data['hourly_values'] = data.full_string.str[values_start:]
    data.drop(['full_string'], axis=1, inplace=True)

    # One row per hour: split the values, then stack them under (month, day)
    data['hourly_values'] = data['hourly_values'].apply(
        separate_three_hourly_vals)
    per_day = data.set_index(['month', 'day'])['hourly_values']
    data = per_day.apply(pd.Series).stack()
    data = data.reset_index()
    data.columns = ['month', 'day', 'hour', 'hourly_mean']
    data['hourly_mean'] = data['hourly_mean'].astype(float)

    data['year'] = int(fname[-8:-4])
    timestamps = data.apply(_row_to_datetime, axis=1)
    data.insert(0, 'date', timestamps)
    data.drop(['year', 'day', 'month', 'hour'],
              axis=1, inplace=True)
    return data


def separate_three_hourly_vals(hourstring):
    """Separate 3-hourly ap means from the string containing all 8 values.

    The string holding the values for one day is cut into consecutive
    three-character pieces, and each piece is then repeated 3 times in a row
    to give a value for each hour. Called by ap_readfile.

    Args:
        hourstring (str): string containing the 3-hourly ap means parsed from
            a Kyoto WDC-like file for a single day.

    Returns:
        hourly_vals_list (numpy.ndarray):
            array containing the hourly ap values.
    """
    step = 3
    three_hourly = []
    for start in range(0, len(hourstring), step):
        three_hourly.append(hourstring[start:start + step])
    # The same number is used for the field width and the repeat count
    return np.repeat(three_hourly, step)


def append_ap_data(ap_data_path):
    """Append ap data into a single dataframe containing all years.

    Data downloaded from ftp://ftp.gfz-potsdam.de/pub/home/obs/kp-ap/wdc/
    come in WDC-like format files with one file per year named kpyyyy.wdc. This
    function concatenates all years into a single dataframe. Files are read
    in sorted filename order and each filename is printed as it is read.

    Args:
        ap_data_path (str): path to directory containing WDC-like format ap
            datafiles. All ap files should be located in the same directory.
            A trailing path separator is optional.

    Returns:
        data (pandas.DataFrame):
            dataframe containing all available hourly ap data. First column is
            a series of datetime objects (in the format yyyy-mm-dd hh:30:00)
            and second column contains ap values at the specified times.
            An empty dataframe is returned if no files are found.
    """
    # Obtain a list of all files starting with 'kp' and ending in .wdc in the
    # specified directory. os.path.join inserts the path separator when the
    # directory is given without a trailing one.
    filenames = sorted(glob.glob(os.path.join(ap_data_path, 'kp*.wdc')))

    frames = []
    for fname in filenames:
        print(fname)
        try:
            frames.append(ap_readfile(fname))
        except StopIteration:
            # Best-effort skip kept from the original implementation
            pass

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated data for every file
    return pd.concat(frames, ignore_index=True)