-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor: Time-series-related functions
- Loading branch information
Showing
1 changed file
with
329 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,329 @@ | ||
import numpy as np | ||
from scipy.stats import binned_statistic | ||
|
||
|
||
def arrivial_rate(arr_times, dt, start_hour=-1, start_by="pad"): | ||
""" | ||
Compute the arrivial rate by counting the number of arrivial in (t-dt, t). | ||
Return the computed rate along with an array containing the | ||
corresponding time values. | ||
The start of the time array depends on the first arrivial time, | ||
specifying start_hour != -1 force the time axis start at the wanted hour. | ||
start_by specify the method for offseting the time axis. | ||
pad: Pad zeros up to the start point, so we don't lose data | ||
cut: Cut data down to the start point, so we don't add extra zeros | ||
(!) When UTC timestamps are inputted for arr_times, the resulting | ||
counts will also be in UTC. | ||
""" | ||
|
||
arr_times = np.sort(arr_times) | ||
first = arr_times[0] | ||
|
||
# Compute the first point of the time axis | ||
if start_hour == -1: | ||
t0 = first | ||
else: | ||
first_sec = first % 86400 | ||
start_sec = 3600*start_hour | ||
delta = first_sec - start_sec | ||
|
||
if start_by == "pad": | ||
if delta >= 0: | ||
t0 = first - delta | ||
else: | ||
t0 = first - 86400 - delta | ||
|
||
elif start_by == "cut": | ||
if delta <= 0: | ||
t0 = first - delta | ||
else: | ||
t0 = first + 86400 - delta | ||
|
||
# The first valid time point is t0 + dt because we have no data before t0 | ||
t_axis = np.arange(t0 + dt, arr_times[-1] + dt, dt) | ||
|
||
# Compute the arrivial count by histogram | ||
bins = np.insert(t_axis, 0, t0) | ||
count, _ = np.histogram(arr_times, bins) | ||
|
||
return count, t_axis | ||
|
||
|
||
def moving_avg(x, t, dt, start_hour=-1, start_by="pad"): | ||
""" | ||
Compute the moving average of x(t) within (t-dt, t). | ||
Return the computed average along with an array containing the | ||
corresponding time values. | ||
The start of the time array depends on the first time point. | ||
Specifying start_hour != -1 force the time axis start at the wanted hour. | ||
Specifying start_hour == -1 will simply start from the first recorded time | ||
start_by specify the method for offseting the time axis. | ||
pad: Pad zeros up to the start point, so we don't lose data | ||
cut: Cut data down to the start point, so we don't add extra zeros | ||
(!) When UTC timestamps are inputted for arr_times, the resulting | ||
counts will also be in UTC. | ||
""" | ||
|
||
# 1 - Sort the input data in timed order | ||
ind = np.argsort(t) | ||
t = t[ind] | ||
x = x[ind] | ||
first = t[0] | ||
|
||
# 2 - Compute the first point of the time axis | ||
if start_hour == -1: | ||
t0 = first | ||
else: | ||
first_sec = first % 86400 | ||
start_sec = 3600 * start_hour | ||
delta = first_sec - start_sec | ||
|
||
if start_by == "pad": | ||
if delta >= 0: | ||
t0 = first - delta | ||
else: | ||
t0 = first - 86400 - delta | ||
|
||
elif start_by == "cut": | ||
if delta <= 0: | ||
t0 = first - delta | ||
else: | ||
t0 = first + 86400 - delta | ||
|
||
# The first valid time point is t0 + dt because we have no data before t0 | ||
t_axis = np.arange(t0 + dt, t[-1] + dt, dt) | ||
|
||
# Compute the arrivial count by histogram | ||
bins = np.insert(t_axis, 0, t0) | ||
avg, _, _ = binned_statistic(t, x, statistic="mean", bins=bins) | ||
|
||
return avg, t_axis | ||
|
||
|
||
def fold_timeseries(timeseries, dt, foldsize): | ||
""" | ||
Fold a timeseries onto a given time-window size | ||
""" | ||
|
||
n = round(foldsize / dt) | ||
total_len = int(len(timeseries) / n) | ||
|
||
folded = [timeseries[k*n:(k+1)*n] for k in range(total_len)] | ||
return folded | ||
|
||
|
||
def nullify_day(timeseries, dt): | ||
""" | ||
Replace value by nan if the whole day contains no data | ||
""" | ||
|
||
# Number of point per day | ||
n = 86400 // dt | ||
|
||
# Change data type to float to allow nan | ||
timeseries = timeseries.astype(np.float64) | ||
|
||
# Fold data into daily winodw | ||
folded = fold_timeseries(timeseries, dt, foldsize=86400) | ||
for i, window in enumerate(folded): | ||
|
||
# If there is not a single flight in a day, | ||
# replace all zeros to nan | ||
if np.sum(window) == 0: | ||
print("Found empty day!") | ||
for j in range(-n, 2*n): | ||
timeseries[i*n + j] = np.nan | ||
|
||
return timeseries | ||
|
||
|
||
def autocorr(timeseries, dt, lag_max): | ||
""" | ||
Compute the autocorrelation coefficient for a stationary time series. | ||
Up to lag_max of time lag. | ||
""" | ||
|
||
if lag_max > (dt * len(timeseries)): | ||
lag_max = (dt * len(timeseries)) | ||
|
||
n_max = lag_max // dt | ||
a = np.zeros(n_max + 1) | ||
mean = np.nanmean(timeseries) | ||
var = np.nanvar(timeseries) | ||
|
||
for n in range(n_max + 1): | ||
sample = [timeseries[i]*timeseries[i+n] | ||
for i in range(len(timeseries) - n)] | ||
|
||
a[n] = np.nanmean(sample) - mean*mean | ||
a[n] /= var | ||
|
||
return a | ||
|
||
|
||
def autocorr_period(timeseries, dt, period): | ||
""" | ||
Compute the autocorrelation coefficient for a periodic time series | ||
""" | ||
|
||
n = round(period / dt) | ||
n_period = int(len(timeseries) / n) | ||
a_matrix = np.zeros((n, n)) | ||
|
||
mean = np.zeros(n) | ||
var = np.zeros(n) | ||
for i in range(n): | ||
sample = [timeseries[m*n + i] for m in range(n_period-1)] | ||
mean[i] = np.nanmean(sample) | ||
var[i] = np.nanvar(sample) | ||
|
||
for i in range(n): | ||
for j in range(n): | ||
sample = [timeseries[m*n + i] * timeseries[m*n + i + j] | ||
for m in range(n_period-1)] | ||
|
||
j_mod = (i+j) % n | ||
a_matrix[i, j] = np.nanmean(sample) - mean[i]*mean[j_mod] | ||
a_matrix[i, j] /= np.sqrt(var[i] * var[j_mod]) | ||
|
||
return a_matrix | ||
|
||
|
||
def autocovar_period(timeseries, dt, period, cumulant=True): | ||
""" | ||
Compute the autocorrelation function for a periodic time series. | ||
This is also called the autocovariance. | ||
When cumulant is True, compute <X(t)X(t+T)> - <X(t)><X(t+T)>, | ||
cumulant is False, compute <X(t)X(t+T)> | ||
""" | ||
|
||
n = round(period / dt) | ||
n_period = int(len(timeseries) / n) | ||
a_matrix = np.zeros((n, n)) | ||
|
||
if cumulant: | ||
mean = np.zeros(n) | ||
for i in range(n): | ||
sample = [timeseries[m*n + i] for m in range(n_period-1)] | ||
mean[i] = np.nanmean(sample) | ||
|
||
for i in range(n): | ||
for j in range(n): | ||
sample = [timeseries[m*n + i] * timeseries[m*n + i + j] | ||
for m in range(n_period-1)] | ||
|
||
a_matrix[i, j] = np.nanmean(sample) | ||
|
||
if cumulant: | ||
a_matrix[i, j] -= mean[i]*mean[(i+j) % n] | ||
|
||
return a_matrix | ||
|
||
|
||
def autocorr_period_ext(timeseries, dt, period, n_period): | ||
""" | ||
Compute the autocorrelation coefficient for a periodic time series; | ||
extended for n_period of periods | ||
""" | ||
|
||
# If doing for just one period | ||
if n_period == 1: | ||
return autocorr_period(timeseries, dt, period) | ||
|
||
# Number of time steps in a period and in n periods | ||
ts_1 = period // dt | ||
ts_n = (period * n_period) // dt | ||
|
||
# Number of periods that can fit in the time series data | ||
num_period = len(timeseries) // ts_1 | ||
|
||
a_matrix = np.zeros((ts_1, ts_n)) | ||
|
||
mean = np.zeros(ts_1) | ||
var = np.zeros(ts_1) | ||
for i in range(ts_1): | ||
sample = [timeseries[m*ts_1 + i] for m in range(num_period-1)] | ||
mean[i] = np.nanmean(sample) | ||
var[i] = np.nanvar(sample) | ||
|
||
for i in range(ts_1): | ||
for j in range(ts_n): | ||
sample = [timeseries[m*ts_1 + i] * timeseries[m*ts_1 + i + j] | ||
for m in range(num_period-n_period)] | ||
|
||
j_mod = (i+j) % ts_1 | ||
a_matrix[i, j] = np.nanmean(sample) - mean[i]*mean[j_mod] | ||
a_matrix[i, j] /= np.sqrt(var[i] * var[j_mod]) | ||
|
||
return a_matrix | ||
|
||
|
||
def seasonal_difference(timeseries, dt, period): | ||
""" | ||
Remove additive seasonal trend by seaonsal differencing, | ||
i.e. X(t) -> X(t) - X(t - period) | ||
The new time series will have one less period in length | ||
""" | ||
|
||
n = period // dt | ||
diff = [timeseries[i] - timeseries[i - n] | ||
for i in range(n, len(timeseries))] | ||
|
||
return diff | ||
|
||
|
||
def seasonal_quotient(timeseries, dt, period): | ||
""" | ||
Remove multiplicative seasonal trend by seaonsal quotienting, | ||
i.e. X(t) -> X(t)/X(t - period) | ||
The new time series will have one less period in length | ||
""" | ||
|
||
n = period // dt | ||
diff = [timeseries[i] / timeseries[i - n] | ||
for i in range(n, len(timeseries))] | ||
|
||
return diff | ||
|
||
|
||
def seasonal_mean(timeseries, dt, period): | ||
""" | ||
Compute the seasonal mean | ||
""" | ||
|
||
n = period // dt | ||
n_period = len(timeseries) // n | ||
|
||
mean = np.zeros(n) | ||
for i in range(n): | ||
sample = [timeseries[m*n + i] for m in range(n_period-1)] | ||
mean[i] = np.mean(sample) | ||
|
||
return mean | ||
|
||
|
||
def remove_seasonal_mean(timeseries, dt, period): | ||
""" | ||
Remove seasonal trend by substracting the seasonal mean | ||
""" | ||
|
||
n = period // dt | ||
|
||
mean = seasonal_mean(timeseries, dt, period) | ||
|
||
# Substract the seasonal mean elementwise to avoid edge effects | ||
detrended = np.zeros(len(timeseries)) | ||
for i in range(len(detrended)): | ||
i_trend = i % n | ||
detrended[i] = timeseries[i] - mean[i_trend] | ||
|
||
return detrended |