-
Notifications
You must be signed in to change notification settings - Fork 42
ETL APIs for handling Time Series #366
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: adatao
Are you sure you want to change the base?
Changes from all commits
4897a27
d5bd977
5bed828
1dc3203
966c6b6
fe70d3e
42c4e90
8c241bf
b120df6
41fff42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,127 @@ | ||
| package io.ddf.etl; | ||
|
|
||
|
|
||
| import io.ddf.DDF; | ||
| import io.ddf.analytics.ABinningHandler.BinningType; | ||
| import io.ddf.exception.DDFException; | ||
| import io.ddf.misc.ADDFFunctionalGroupHandler; | ||
| import java.util.List; | ||
| import java.util.concurrent.TimeUnit; | ||
| import com.google.common.collect.Lists; | ||
|
|
||
| public abstract class ATimeSeriesHandler extends ADDFFunctionalGroupHandler implements IHandleTimeSeries { | ||
|
|
||
| protected String mTimestampColumn; | ||
| protected String mTsIDColumn = null; | ||
|
|
||
|
|
||
| public ATimeSeriesHandler(DDF theDDF) { | ||
| super(theDDF); | ||
|
|
||
| } | ||
|
|
||
| public void setTimeStampColumn(String colName) { | ||
| mTimestampColumn = colName; | ||
| } | ||
|
|
||
| public String getTimeStampColumn() { | ||
| return mTimestampColumn; | ||
| } | ||
|
|
||
|
|
||
| public String getTsIDColumn() { | ||
| return mTsIDColumn; | ||
| } | ||
|
|
||
| public void setTsIDColumn(String colName) { | ||
| this.mTsIDColumn = colName; | ||
| } | ||
|
|
||
| @Override | ||
| public DDF downsample(String timestampColumn, List<String> aggregateFunctions, int interval, TimeUnit timeUnit) | ||
| throws DDFException { | ||
|
|
||
| this.mTimestampColumn = timestampColumn; | ||
| List<String> groupByCols = Lists.newArrayList(timestampColumn); | ||
| if (mTsIDColumn != null && !mTsIDColumn.isEmpty()) { | ||
| groupByCols.add(mTsIDColumn); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. downsampling should be for each mTsIDColumn
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is mTsIDColumn?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ID of a time series |
||
| } | ||
|
|
||
| long intervalInSeconds = timeUnit.toSeconds(interval); | ||
|
|
||
| int numBins = getNumBins(intervalInSeconds); | ||
| DDF binnedDDF = this.getDDF().binning(timestampColumn, BinningType.EQUALINTERVAL.toString(), numBins, null, false, | ||
| true, true); | ||
| DDF newDDF = binnedDDF.groupBy(groupByCols, aggregateFunctions); | ||
|
|
||
| return newDDF; | ||
| } | ||
|
|
||
| @Override | ||
| public DDF downsample(String timestampColumn, String tsIDColumn, List<String> aggregateFunctions, int interval, | ||
| TimeUnit timeUnit) throws DDFException { | ||
|
|
||
| this.mTsIDColumn = tsIDColumn; | ||
| List<String> rs = getDistinctValues(tsIDColumn); | ||
|
|
||
| DDF ddf0 = filterByValue(tsIDColumn, rs.get(0)); | ||
|
|
||
| ddf0.getTimeSeriesHandler().setTsIDColumn(tsIDColumn); | ||
| DDF newDDF = ddf0.getTimeSeriesHandler().downsample(timestampColumn, aggregateFunctions, interval, timeUnit); | ||
| if (rs.size() > 1) { | ||
| for (int i = 1; i < rs.size(); i++) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There 1 key for most usecases for now. I wanted to quickly reuse DDF binning method here. Other KPIs like col diff or moving average used SparkDF window functions. |
||
| DDF filteredDDF = filterByValue(tsIDColumn, rs.get(i)); | ||
| filteredDDF.getTimeSeriesHandler().setTsIDColumn(tsIDColumn); | ||
| DDF nextDDF = filteredDDF.getTimeSeriesHandler().downsample(timestampColumn, aggregateFunctions, interval, | ||
| timeUnit); | ||
| newDDF = newDDF.getJoinsHandler().merge(nextDDF); | ||
| } | ||
| } | ||
| return newDDF; | ||
| } | ||
|
|
||
| @Override | ||
| public DDF addDiffColumn(String timestampColumn, String colToGetDiff, String diffColumn) throws DDFException{ | ||
| return addDiffColumn(timestampColumn, null, colToGetDiff, diffColumn); | ||
| } | ||
|
|
||
| @Override | ||
| public DDF addDiffColumn(String timestampColumn, String tsIDColumn, String colToGetDiff, String diffColumn) | ||
| throws DDFException { | ||
| // TODO Auto-generated method stub | ||
| return null; | ||
| } | ||
|
|
||
| @Override | ||
| public DDF computeMovingAverage(String timestampColumn, String tsIDColumn, String colToComputeMovingAverage, | ||
| String movingAverageColName, int windowSize) throws DDFException { | ||
| // TODO Auto-generated method stub | ||
| return null; | ||
| } | ||
|
|
||
| @Override | ||
| public void saveTimeSeriesToCSV(String pathToStorage) { | ||
| // TODO Auto-generated method stub | ||
|
|
||
| } | ||
|
|
||
| private int getNumBins(long intervalInSeconds) throws DDFException { | ||
| long minTimeStamp = this.getDDF().getVectorMin(mTimestampColumn).longValue(); | ||
| long maxTimeStamp = this.getDDF().getVectorMax(mTimestampColumn).longValue(); | ||
| int numBins = (int) ((maxTimeStamp - minTimeStamp) / intervalInSeconds); | ||
| return numBins; | ||
|
|
||
| } | ||
|
|
||
| private List<String> getDistinctValues(String colName) throws DDFException { | ||
| String sqlCmd = String.format("SELECT distinct(%s) FROM %s", colName, this.getDDF().getTableName()); | ||
| List<String> rs = this.getManager().sql(sqlCmd, this.getEngine()).getRows(); | ||
| return rs; | ||
| } | ||
|
|
||
| private DDF filterByValue(String colName, String value) throws DDFException { | ||
| String sqlCmd = String.format("SELECT * FROM %s WHERE %s = '%s'", this.getDDF().getTableName(), colName, value); | ||
| DDF filteredDDF = this.getDDF().getSqlHandler().sql2ddf(sqlCmd); | ||
| return filteredDDF; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| package io.ddf.etl; | ||
|
|
||
| import java.util.List; | ||
| import io.ddf.DDF; | ||
| import io.ddf.exception.DDFException; | ||
| import io.ddf.misc.IHandleDDFFunctionalGroup; | ||
| import java.util.concurrent.TimeUnit; | ||
|
|
||
/**
 * Functional-group handler exposing time-series ETL operations on a DDF:
 * downsampling, column differencing, moving averages, and CSV export.
 */
public interface IHandleTimeSeries extends IHandleDDFFunctionalGroup {

  /** Sets the name of the column holding the observation timestamps. */
  void setTimeStampColumn(String colName);

  /** Sets the name of the column identifying individual time series within the DDF. */
  void setTsIDColumn(String colName);

  /** Returns the name of the timestamp column. */
  String getTimeStampColumn() throws DDFException;

  /**
   * Downsamples the DDF by bucketing {@code timestampColumn} into equal intervals of
   * {@code interval} {@code timeUnit}s and applying {@code aggregateFunctions} per bucket.
   */
  DDF downsample(String timestampColumn, List<String> aggregateFunctions, int interval, TimeUnit timeUnit) throws DDFException;

  /**
   * Downsamples each time series identified by {@code tsIDColumn} independently and
   * merges the per-series results.
   */
  DDF downsample(String timestampColumn, String tsIDColumn, List<String> aggregateFunctions, int interval, TimeUnit timeUnit) throws DDFException;

  /**
   * Intended to add a column {@code diffColName} derived from {@code colToGetDiff}
   * (presumably successive differences ordered by timestamp — implementation is still
   * a stub in ATimeSeriesHandler; confirm semantics there).
   */
  DDF addDiffColumn(String timestampColumn, String colToGetDiff, String diffColName) throws DDFException;

  /** Per-series variant of {@link #addDiffColumn(String, String, String)}. */
  DDF addDiffColumn(String timestampColumn, String tsIDColumn, String colToGetDiff, String diffColName) throws DDFException;

  /**
   * Intended to add a moving-average column {@code movingAverageColName} over windows of
   * {@code windowSize} (implementation is still a stub in ATimeSeriesHandler).
   */
  DDF computeMovingAverage(String timestampColumn, String tsIDColumn, String colToComputeMovingAverage, String movingAverageColName,
      int windowSize) throws DDFException;

  /** Saves the time series to CSV at the given storage path (not yet implemented). */
  void saveTimeSeriesToCSV(String path);
}
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to keep mTimestampColumn and mTsIDColumn as instance variables? They are already passed in by the downsample function, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I planned to use them in the save_ts method; you can modify that when working on the save_ts implementation.