@@ -10,6 +10,9 @@ import { processPolledWebhookEvent } from '@/lib/webhooks/processor'
1010
1111const MAX_ROWS_PER_POLL = 100 // fallback cap on rows handled in one poll, used when config.maxRowsPerPoll is unset or falsy
1212
13+ /** Maximum number of leading rows (starting at row 1) scanned when auto-detecting the header row; the first non-empty row inside this window is used as the header. */
14+ const HEADER_SCAN_ROWS = 10
15+
1316type ValueRenderOption = 'FORMATTED_VALUE' | 'UNFORMATTED_VALUE' | 'FORMULA' // sent as the `valueRenderOption` query parameter on Sheets values requests
1417type DateTimeRenderOption = 'SERIAL_NUMBER' | 'FORMATTED_STRING' // sent as the `dateTimeRenderOption` query parameter on Sheets values requests
1518
@@ -20,7 +23,8 @@ interface GoogleSheetsWebhookConfig {
2023 manualSheetName ?: string
2124 valueRenderOption ?: ValueRenderOption
2225 dateTimeRenderOption ?: DateTimeRenderOption
23- lastKnownRowCount ?: number
26+ /** 1-indexed row number of the last row seeded or processed. */
27+ lastIndexChecked ?: number
2428 lastModifiedTime ?: string
2529 lastCheckedTimestamp ?: string
2630 maxRowsPerPoll ?: number
@@ -63,7 +67,6 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
6367 return 'failure'
6468 }
6569
66- // Pre-check: use Drive API to see if the file was modified since last poll
6770 const { unchanged : skipPoll , currentModifiedTime } = await isDriveFileUnchanged (
6871 accessToken ,
6972 spreadsheetId ,
@@ -83,44 +86,51 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
8386 return 'success'
8487 }
8588
86- // Fetch current row count via column A
87- const currentRowCount = await getDataRowCount (
89+ const valueRender = config . valueRenderOption || 'FORMATTED_VALUE'
90+ const dateTimeRender = config . dateTimeRenderOption || 'SERIAL_NUMBER'
91+
92+ const {
93+ rowCount : currentRowCount ,
94+ headers,
95+ headerRowIndex,
96+ } = await fetchSheetState (
8897 accessToken ,
8998 spreadsheetId ,
9099 sheetName ,
100+ valueRender ,
101+ dateTimeRender ,
91102 requestId ,
92103 logger
93104 )
94105
95106 // First poll: seed state, emit nothing
96- if ( config . lastKnownRowCount === undefined ) {
107+ if ( config . lastIndexChecked === undefined ) {
97108 await updateWebhookProviderConfig (
98109 webhookId ,
99110 {
100- lastKnownRowCount : currentRowCount ,
111+ lastIndexChecked : currentRowCount ,
101112 lastModifiedTime : currentModifiedTime ?? config . lastModifiedTime ,
102113 lastCheckedTimestamp : now . toISOString ( ) ,
103114 } ,
104115 logger
105116 )
106117 await markWebhookSuccess ( webhookId , logger )
107118 logger . info (
108- `[${ requestId } ] First poll for webhook ${ webhookId } , seeded row count : ${ currentRowCount } `
119+ `[${ requestId } ] First poll for webhook ${ webhookId } , seeded row index : ${ currentRowCount } `
109120 )
110121 return 'success'
111122 }
112123
113- // Rows deleted or unchanged
114- if ( currentRowCount <= config . lastKnownRowCount ) {
115- if ( currentRowCount < config . lastKnownRowCount ) {
124+ if ( currentRowCount <= config . lastIndexChecked ) {
125+ if ( currentRowCount < config . lastIndexChecked ) {
116126 logger . warn (
117- `[${ requestId } ] Row count decreased from ${ config . lastKnownRowCount } to ${ currentRowCount } for webhook ${ webhookId } `
127+ `[${ requestId } ] Row count decreased from ${ config . lastIndexChecked } to ${ currentRowCount } for webhook ${ webhookId } `
118128 )
119129 }
120130 await updateWebhookProviderConfig (
121131 webhookId ,
122132 {
123- lastKnownRowCount : currentRowCount ,
133+ lastIndexChecked : currentRowCount ,
124134 lastModifiedTime : currentModifiedTime ?? config . lastModifiedTime ,
125135 lastCheckedTimestamp : now . toISOString ( ) ,
126136 } ,
@@ -131,38 +141,47 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
131141 return 'success'
132142 }
133143
134- // New rows detected
135- const newRowCount = currentRowCount - config . lastKnownRowCount
144+ const newRowCount = currentRowCount - config . lastIndexChecked
136145 const maxRows = config . maxRowsPerPoll || MAX_ROWS_PER_POLL
137146 const rowsToFetch = Math . min ( newRowCount , maxRows )
138- const startRow = config . lastKnownRowCount + 1
139- const endRow = config . lastKnownRowCount + rowsToFetch
147+ const startRow = config . lastIndexChecked + 1
148+ const endRow = config . lastIndexChecked + rowsToFetch
149+
150+ // Skip past the header row (and any blank rows above it) so it is never
151+ // emitted as a data event.
152+ const adjustedStartRow =
153+ headerRowIndex > 0 ? Math . max ( startRow , headerRowIndex + 1 ) : startRow
140154
141155 logger . info (
142- `[${ requestId } ] Found ${ newRowCount } new rows for webhook ${ webhookId } , processing rows ${ startRow } -${ endRow } `
156+ `[${ requestId } ] Found ${ newRowCount } new rows for webhook ${ webhookId } , processing rows ${ adjustedStartRow } -${ endRow } `
143157 )
144158
145- // Resolve render options
146- const valueRender = config . valueRenderOption || 'FORMATTED_VALUE'
147- const dateTimeRender = config . dateTimeRenderOption || 'SERIAL_NUMBER'
148-
149- const headers = await fetchHeaderRow (
150- accessToken ,
151- spreadsheetId ,
152- sheetName ,
153- valueRender ,
154- dateTimeRender ,
155- requestId ,
156- logger
157- )
159+ // Entire batch is header/blank rows — advance pointer and skip fetch.
160+ if ( adjustedStartRow > endRow ) {
161+ const hasRemainingRows = rowsToFetch < newRowCount
162+ await updateWebhookProviderConfig (
163+ webhookId ,
164+ {
165+ lastIndexChecked : config . lastIndexChecked + rowsToFetch ,
166+ lastModifiedTime : hasRemainingRows
167+ ? config . lastModifiedTime
168+ : ( currentModifiedTime ?? config . lastModifiedTime ) ,
169+ lastCheckedTimestamp : now . toISOString ( ) ,
170+ } ,
171+ logger
172+ )
173+ await markWebhookSuccess ( webhookId , logger )
174+ logger . info (
175+ `[${ requestId } ] Batch ${ startRow } -${ endRow } contained only header/blank rows for webhook ${ webhookId } , advancing pointer`
176+ )
177+ return 'success'
178+ }
158179
159- // Fetch new rows — startRow/endRow are already 1-indexed sheet row numbers
160- // because lastKnownRowCount includes the header row
161180 const newRows = await fetchRowRange (
162181 accessToken ,
163182 spreadsheetId ,
164183 sheetName ,
165- startRow ,
184+ adjustedStartRow ,
166185 endRow ,
167186 valueRender ,
168187 dateTimeRender ,
@@ -173,23 +192,22 @@ export const googleSheetsPollingHandler: PollingProviderHandler = {
173192 const { processedCount, failedCount } = await processRows (
174193 newRows ,
175194 headers ,
176- startRow ,
195+ adjustedStartRow ,
177196 spreadsheetId ,
178197 sheetName ,
179- config ,
180198 webhookData ,
181199 workflowData ,
182200 requestId ,
183201 logger
184202 )
185203
186204 const rowsAdvanced = failedCount > 0 ? 0 : rowsToFetch
187- const newLastKnownRowCount = config . lastKnownRowCount + rowsAdvanced
205+ const newLastIndexChecked = config . lastIndexChecked + rowsAdvanced
188206 const hasRemainingOrFailed = rowsAdvanced < newRowCount
189207 await updateWebhookProviderConfig (
190208 webhookId ,
191209 {
192- lastKnownRowCount : newLastKnownRowCount ,
210+ lastIndexChecked : newLastIndexChecked ,
193211 lastModifiedTime : hasRemainingOrFailed
194212 ? config . lastModifiedTime
195213 : ( currentModifiedTime ?? config . lastModifiedTime ) ,
@@ -256,20 +274,32 @@ async function getDriveFileModifiedTime(
256274 }
257275}
258276
259- async function getDataRowCount (
277+ /**
278+ * Fetches the sheet (A:Z) and returns the row count, auto-detected headers,
279+ * and the 1-indexed header row number in a single API call.
280+ *
281+ * The Sheets API omits trailing empty rows, so `rows.length` equals the
282+ * 1-indexed number of the last non-empty row in columns A–Z. Header detection
283+ * takes the first non-empty row among the first {@link HEADER_SCAN_ROWS} rows.
284+ * Returns `headerRowIndex = 0` and empty `headers` when that window has none.
285+ */
286+ async function fetchSheetState (
260287 accessToken : string ,
261288 spreadsheetId : string ,
262289 sheetName : string ,
290+ valueRenderOption : ValueRenderOption ,
291+ dateTimeRenderOption : DateTimeRenderOption ,
263292 requestId : string ,
264293 logger : ReturnType < typeof import ( '@sim/logger' ) . createLogger >
265- ) : Promise < number > {
294+ ) : Promise < { rowCount : number ; headers : string [ ] ; headerRowIndex : number } > {
266295 const encodedSheet = encodeURIComponent ( sheetName )
267- // Fetch all rows across columns A–Z with majorDimension=ROWS so the API
268- // returns one entry per row that has ANY non-empty cell. Rows where column A
269- // is empty but other columns have data are included, whereas the previous
270- // column-A-only approach silently missed them. The returned array length
271- // equals the 1-indexed row number of the last row with data.
272- const url = `https://sheets.googleapis.com/v4/spreadsheets/${ spreadsheetId } /values/${ encodedSheet } !A:Z?majorDimension=ROWS&fields=values`
296+ const params = new URLSearchParams ( {
297+ majorDimension : 'ROWS' ,
298+ fields : 'values' ,
299+ valueRenderOption,
300+ dateTimeRenderOption,
301+ } )
302+ const url = `https://sheets.googleapis.com/v4/spreadsheets/${ spreadsheetId } /values/${ encodedSheet } !A:Z?${ params . toString ( ) } `
273303
274304 const response = await fetch ( url , {
275305 headers : { Authorization : `Bearer ${ accessToken } ` } ,
@@ -278,61 +308,32 @@ async function getDataRowCount(
278308 if ( ! response . ok ) {
279309 const status = response . status
280310 const errorData = await response . json ( ) . catch ( ( ) => ( { } ) )
281-
282311 if ( status === 403 || status === 429 ) {
283312 throw new Error (
284313 `Sheets API rate limit (${ status } ) — skipping to retry next poll cycle: ${ JSON . stringify ( errorData ) } `
285314 )
286315 }
287-
288316 throw new Error (
289- `Failed to fetch row count : ${ status } ${ response . statusText } - ${ JSON . stringify ( errorData ) } `
317+ `Failed to fetch sheet state : ${ status } ${ response . statusText } - ${ JSON . stringify ( errorData ) } `
290318 )
291319 }
292320
293321 const data = await response . json ( )
294- // values is [[row1col1, row1col2, ...], [row2col1, ...], ...] when majorDimension=ROWS.
295- // The Sheets API omits trailing empty rows, so the array length is the last
296- // non-empty row index (1-indexed), which is exactly what we need.
297- const rows = data . values as string [ ] [ ] | undefined
298- return rows ?. length ?? 0
299- }
322+ const rows = ( data . values as string [ ] [ ] | undefined ) ?? [ ]
323+ const rowCount = rows . length
300324
301- async function fetchHeaderRow (
302- accessToken : string ,
303- spreadsheetId : string ,
304- sheetName : string ,
305- valueRenderOption : ValueRenderOption ,
306- dateTimeRenderOption : DateTimeRenderOption ,
307- requestId : string ,
308- logger : ReturnType < typeof import ( '@sim/logger' ) . createLogger >
309- ) : Promise < string [ ] > {
310- const encodedSheet = encodeURIComponent ( sheetName )
311- const params = new URLSearchParams ( {
312- fields : 'values' ,
313- valueRenderOption,
314- dateTimeRenderOption,
315- } )
316- const url = `https://sheets.googleapis.com/v4/spreadsheets/${ spreadsheetId } /values/${ encodedSheet } !1:1?${ params . toString ( ) } `
317-
318- const response = await fetch ( url , {
319- headers : { Authorization : `Bearer ${ accessToken } ` } ,
320- } )
321-
322- if ( ! response . ok ) {
323- const status = response . status
324- if ( status === 403 || status === 429 ) {
325- const errorData = await response . json ( ) . catch ( ( ) => ( { } ) )
326- throw new Error (
327- `Sheets API rate limit (${ status } ) fetching header row — skipping to retry next poll cycle: ${ JSON . stringify ( errorData ) } `
328- )
325+ let headers : string [ ] = [ ]
326+ let headerRowIndex = 0
327+ for ( let i = 0 ; i < Math . min ( rows . length , HEADER_SCAN_ROWS ) ; i ++ ) {
328+ const row = rows [ i ]
329+ if ( row ?. some ( ( cell ) => cell !== '' ) ) {
330+ headers = row
331+ headerRowIndex = i + 1
332+ break
329333 }
330- logger . warn ( `[${ requestId } ] Failed to fetch header row, proceeding without headers` )
331- return [ ]
332334 }
333335
334- const data = await response . json ( )
335- return ( data . values ?. [ 0 ] as string [ ] ) ?? [ ]
336+ return { rowCount, headers, headerRowIndex }
336337}
337338
338339async function fetchRowRange (
@@ -361,13 +362,11 @@ async function fetchRowRange(
361362 if ( ! response . ok ) {
362363 const status = response . status
363364 const errorData = await response . json ( ) . catch ( ( ) => ( { } ) )
364-
365365 if ( status === 403 || status === 429 ) {
366366 throw new Error (
367367 `Sheets API rate limit (${ status } ) — skipping to retry next poll cycle: ${ JSON . stringify ( errorData ) } `
368368 )
369369 }
370-
371370 throw new Error (
372371 `Failed to fetch rows ${ startRow } -${ endRow } : ${ status } ${ response . statusText } - ${ JSON . stringify ( errorData ) } `
373372 )
@@ -383,7 +382,6 @@ async function processRows(
383382 startRowIndex : number ,
384383 spreadsheetId : string ,
385384 sheetName : string ,
386- config : GoogleSheetsWebhookConfig ,
387385 webhookData : PollWebhookContext [ 'webhookData' ] ,
388386 workflowData : PollWebhookContext [ 'workflowData' ] ,
389387 requestId : string ,
@@ -394,7 +392,13 @@ async function processRows(
394392
395393 for ( let i = 0 ; i < rows . length ; i ++ ) {
396394 const row = rows [ i ]
397- const rowNumber = startRowIndex + i // startRowIndex is already the 1-indexed sheet row
395+ const rowNumber = startRowIndex + i
396+
397+ // Skip empty rows — don't fire a workflow run with no data.
398+ if ( ! row || row . length === 0 ) {
399+ logger . info ( `[${ requestId } ] Skipping empty row ${ rowNumber } for webhook ${ webhookData . id } ` )
400+ continue
401+ }
398402
399403 try {
400404 await pollingIdempotency . executeWithIdempotency (
0 commit comments