1616 *
1717 */
1818
19+ use crate :: event:: USER_AGENT_KEY ;
20+ use crate :: event:: format:: EventFormat ;
1921use crate :: event:: format:: LogSource ;
2022use crate :: event:: format:: LogSourceEntry ;
23+ use crate :: event:: format:: json;
2124use crate :: handlers:: TelemetryType ;
2225use crate :: handlers:: http:: ingest:: PostError ;
23- use crate :: handlers :: http :: modal :: utils :: ingest_utils :: flatten_and_push_logs ;
26+ use crate :: metadata :: SchemaVersion ;
2427use crate :: parseable:: PARSEABLE ;
2528use crate :: query:: QUERY_SESSION_STATE ;
2629use crate :: storage:: ObjectStorageError ;
2730use crate :: storage:: StreamType ;
28- use crate :: utils:: DATASET_STATS_STREAM_NAME ;
31+ use crate :: utils:: json :: apply_generic_flattening_for_partition ;
2932use arrow_array:: Array ;
3033use arrow_array:: BinaryArray ;
3134use arrow_array:: BinaryViewArray ;
@@ -39,9 +42,13 @@ use arrow_array::TimestampMillisecondArray;
3942use arrow_schema:: DataType ;
4043use arrow_schema:: Schema ;
4144use arrow_schema:: TimeUnit ;
45+ use chrono:: DateTime ;
46+ use chrono:: NaiveDateTime ;
47+ use chrono:: Utc ;
4248use datafusion:: prelude:: ParquetReadOptions ;
4349use datafusion:: prelude:: SessionContext ;
4450use futures:: StreamExt ;
51+ use regex:: Regex ;
4552use serde:: Serialize ;
4653use std:: collections:: HashMap ;
4754use std:: collections:: HashSet ;
@@ -51,6 +58,8 @@ use tracing::trace;
5158use tracing:: warn;
5259use ulid:: Ulid ;
5360
61+ pub const DATASET_STATS_STREAM_NAME : & str = "pstats" ;
62+ const DATASET_STATS_CUSTOM_PARTITION : & str = "dataset_name" ;
5463const MAX_CONCURRENT_FIELD_STATS : usize = 10 ;
5564
5665#[ derive( Serialize , Debug ) ]
@@ -82,6 +91,13 @@ pub async fn calculate_field_stats(
8291 schema : & Schema ,
8392 max_field_statistics : usize ,
8493) -> Result < bool , PostError > {
94+ //create datetime from timestamp present in parquet path
95+ let parquet_ts = extract_datetime_from_parquet_path_regex ( parquet_path) . map_err ( |e| {
96+ PostError :: Invalid ( anyhow:: anyhow!(
97+ "Failed to extract datetime from parquet path: {}" ,
98+ e
99+ ) )
100+ } ) ?;
85101 let field_stats = {
86102 let ctx = SessionContext :: new_with_state ( QUERY_SESSION_STATE . clone ( ) ) ;
87103 let table_name = Ulid :: new ( ) . to_string ( ) ;
@@ -113,19 +129,41 @@ pub async fn calculate_field_stats(
113129 . create_stream_if_not_exists (
114130 DATASET_STATS_STREAM_NAME ,
115131 StreamType :: Internal ,
116- Some ( & "dataset_name" . into ( ) ) ,
132+ Some ( & DATASET_STATS_CUSTOM_PARTITION . to_string ( ) ) ,
117133 vec ! [ log_source_entry] ,
118134 TelemetryType :: Logs ,
119135 )
120136 . await ?;
121- flatten_and_push_logs (
137+ let vec_json = apply_generic_flattening_for_partition (
122138 stats_value,
123- DATASET_STATS_STREAM_NAME ,
124- & LogSource :: Json ,
125- & HashMap :: new ( ) ,
126139 None ,
127- )
128- . await ?;
140+ None ,
141+ Some ( & DATASET_STATS_CUSTOM_PARTITION . to_string ( ) ) ,
142+ ) ?;
143+ let mut p_custom_fields = HashMap :: new ( ) ;
144+ p_custom_fields. insert ( USER_AGENT_KEY . to_string ( ) , "parseable" . to_string ( ) ) ;
145+ for json in vec_json {
146+ let origin_size = serde_json:: to_vec ( & json) . unwrap ( ) . len ( ) as u64 ; // string length need not be the same as byte length
147+ let schema = PARSEABLE
148+ . get_stream ( DATASET_STATS_STREAM_NAME ) ?
149+ . get_schema_raw ( ) ;
150+ json:: Event {
151+ json,
152+ p_timestamp : parquet_ts,
153+ }
154+ . into_event (
155+ DATASET_STATS_STREAM_NAME . to_string ( ) ,
156+ origin_size,
157+ & schema,
158+ false ,
159+ Some ( & DATASET_STATS_CUSTOM_PARTITION . to_string ( ) ) ,
160+ None ,
161+ SchemaVersion :: V1 ,
162+ StreamType :: Internal ,
163+ & p_custom_fields,
164+ ) ?
165+ . process ( ) ?;
166+ }
129167 Ok ( stats_calculated)
130168}
131169
@@ -388,6 +426,35 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
388426 }
389427}
390428
429+ fn extract_datetime_from_parquet_path_regex (
430+ parquet_path : & Path ,
431+ ) -> Result < DateTime < Utc > , Box < dyn std:: error:: Error > > {
432+ let filename = parquet_path
433+ . file_name ( )
434+ . and_then ( |name| name. to_str ( ) )
435+ . ok_or ( "Invalid filename" ) ?;
436+
437+ // Regex to match date=YYYY-MM-DD.hour=HH.minute=MM pattern
438+ let re = Regex :: new ( r"date=(\d{4}-\d{2}-\d{2})\.hour=(\d{1,2})\.minute=(\d{1,2})" ) ?;
439+
440+ if let Some ( captures) = re. captures ( filename) {
441+ let date = & captures[ 1 ] ;
442+ let hour = & captures[ 2 ] ;
443+ let minute = & captures[ 3 ] ;
444+
445+ // Create datetime string
446+ let datetime_str = format ! ( "{} {}:{}:00" , date, hour, minute) ;
447+
448+ // Parse the datetime
449+ let naive_dt = NaiveDateTime :: parse_from_str ( & datetime_str, "%Y-%m-%d %H:%M:%S" ) ?;
450+ let datetime = DateTime :: < Utc > :: from_naive_utc_and_offset ( naive_dt, Utc ) ;
451+
452+ Ok ( datetime)
453+ } else {
454+ Err ( "Could not parse datetime from filename" . into ( ) )
455+ }
456+ }
457+
391458#[ cfg( test) ]
392459mod tests {
393460 use std:: { fs:: OpenOptions , sync:: Arc } ;
0 commit comments