diff --git a/src/index.ts b/src/index.ts index 0509ca2..adf1d0b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2141,16 +2141,24 @@ ${JSON.stringify(schema, null, 2)}` const samples = field.samples; const nonNullSamples = samples.filter(s => s !== null && s !== undefined); - // Calculate required percentage - const requiredPercentage = nonNullSamples.length / totalRecords; - const required = requiredPercentage > 0.8; // 80% threshold for required + // Filter out empty/meaningless values for better required field detection + const meaningfulSamples = samples.filter(s => + s !== null && + s !== undefined && + s !== "" && + (typeof s === 'string' ? s.trim() !== "" : true) + ); + + // Calculate required percentage based on meaningful data + const requiredPercentage = meaningfulSamples.length / totalRecords; + const required = requiredPercentage > 0.7; // 70% threshold for required (more lenient) // Detect type let type = 'string'; let size = 255; - if (nonNullSamples.length > 0) { - const firstSample = nonNullSamples[0]; + if (meaningfulSamples.length > 0) { + const firstSample = meaningfulSamples[0]; if (typeof firstSample === 'boolean') { type = 'boolean'; @@ -2158,16 +2166,16 @@ ${JSON.stringify(schema, null, 2)}` type = 'integer'; } else if (typeof firstSample === 'string') { // Check for email pattern - if (nonNullSamples.some(s => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(s))) { + if (meaningfulSamples.some(s => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(s))) { type = 'email'; } // Check for datetime pattern - else if (nonNullSamples.some(s => !isNaN(Date.parse(s)))) { + else if (meaningfulSamples.some(s => !isNaN(Date.parse(s)))) { type = 'datetime'; } else { type = 'string'; - // Calculate max string length - const maxLength = Math.max(...nonNullSamples.map(s => String(s).length)); + // Calculate max string length from meaningful samples + const maxLength = Math.max(...meaningfulSamples.map(s => String(s).length)); size = Math.max(255, Math.ceil(maxLength * 1.2)); // 20% buffer } }