-
Notifications
You must be signed in to change notification settings - Fork 83
Infer schema for relevant data sources #344
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ea741fe
558a6a6
ba09d45
3a3f5a1
e151efd
ec38311
6e9d64e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -191,8 +191,8 @@ function sourceCache(loadSource) { | |
const loadTableDataSource = sourceCache(async (source, name) => { | ||
if (source instanceof FileAttachment) { | ||
switch (source.mimeType) { | ||
case "text/csv": return source.csv({typed: true}); | ||
case "text/tab-separated-values": return source.tsv({typed: true}); | ||
case "text/csv": return source.csv(); | ||
case "text/tab-separated-values": return source.tsv(); | ||
case "application/json": return source.json(); | ||
case "application/x-sqlite3": return source.sqlite(); | ||
} | ||
|
@@ -543,8 +543,9 @@ export function getTypeValidator(colType) { | |
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table | ||
// function to do table operations on in-memory data? | ||
export function __table(source, operations) { | ||
const input = source; | ||
let {schema, columns} = source; | ||
if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source); | ||
const input = source; | ||
let primitive = arrayIsPrimitive(source); | ||
if (primitive) source = Array.from(source, (value) => ({value})); | ||
for (const {type, operands} of operations.filter) { | ||
|
@@ -666,3 +667,89 @@ export function __table(source, operations) { | |
} | ||
return source; | ||
} | ||
|
||
/**
 * Reports whether `schema` is usable as a column schema: an array in which
 * every entry is an object with a string `name` and a string `type`.
 *
 * An empty array is considered valid (there is no entry that violates the
 * shape).
 *
 * @param {*} schema - Candidate schema; may be anything, including undefined.
 * @returns {boolean} True when every entry has the expected shape.
 */
function isValidSchema(schema) {
  if (!Array.isArray(schema)) return false;
  // The predicate must return its result; a braced arrow body without
  // `return` yields undefined, which makes every() fail on the first entry.
  return schema.every(
    (s) => Boolean(s) && typeof s.name === "string" && typeof s.type === "string"
  );
}
|
||
/**
 * Builds a fresh tally object for one column, with a zeroed counter for each
 * type category. A new object is returned on every call so columns never
 * share counters.
 *
 * @returns {Object<string, number>} Counters keyed by type name, all zero.
 */
function initKey() {
  // Order matters to callers that break ties by key order, so keep it fixed.
  const typeNames = [
    "other",
    "boolean",
    "integer",
    "number",
    "date",
    "string",
    "array",
    "object",
    "bigint", // TODO for csv, tsv?
    "buffer"
  ];
  const tallies = {};
  for (const typeName of typeNames) tallies[typeName] = 0;
  return tallies;
}
|
||
/**
 * Infers a column schema from the first rows of `source`.
 *
 * Up to 100 rows are sampled. For each column, every sampled value is
 * classified into one of the categories provided by initKey(), and the
 * category seen most often becomes the column's type.
 *
 * @param {Array} source - Rows (objects keyed by column name) or an array of
 *   primitives, which is treated as a single column named "value".
 * @returns {Array<{name: string, type: string}>} One entry per column found
 *   in the sample.
 */
function inferSchema(source) {
  const sampleSize = 100;
  // Strict ISO-8601-style date or date-time, anchored to the whole string.
  const isoDate =
    /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/;
  // Looser, unanchored date shapes: "m/d/yy hh:mm" or "yyyy-m-d". These were
  // two separate checks; per review feedback they are combined into a single
  // regex, which matches exactly when either of the originals would.
  const looseDate =
    /(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})|(\d{4})-(\d{1,2})-(\d{1,2})/;

  let sample = source.slice(0, sampleSize);
  if (arrayIsPrimitive(sample)) {
    sample = sample.map((d) => ({value: d}));
  }

  // A prototype-less object, so that columns named like Object.prototype
  // members ("constructor", "toString", ...) are tallied like any other
  // column instead of being mistaken for an existing entry.
  const typeCounts = Object.create(null);
  for (const d of sample) {
    for (const key in d) {
      if (!typeCounts[key]) typeCounts[key] = initKey();
      const counts = typeCounts[key];
      // For json and sqlite, we already have some types, but for csv and tsv,
      // all columns are strings here.
      const type = typeof d[key];
      const value = type === "string" ? d[key].trim() : d[key];
      if (value === null || value === undefined || value.length === 0) {
        // Missing or empty values say nothing about the column's type.
        counts.other++;
      } else if (type !== "string") {
        if (Array.isArray(value)) counts.array++;
        else if (value instanceof Date) counts.date++;
        else if (value instanceof ArrayBuffer) counts.buffer++;
        else if (type in counts) counts[type]++; // number, bigint, boolean, or object
      } else if (value === "true" || value === "false") {
        counts.boolean++;
      } else if (!Number.isNaN(Number(value))) {
        // Numeric-looking string: distinguish plain integers from the rest.
        if (/^-?[0-9]+$/.test(value)) counts.integer++;
        else counts.number++;
      } else if (isoDate.test(value) || looseDate.test(value)) {
        counts.date++;
      } else {
        counts.string++;
      }
    }
  }

  const schema = [];
  for (const col in typeCounts) {
    const counts = typeCounts[col];
    // Sort descending so the most commonly encountered type is first.
    // (Review nit: without the special handling of "other" below, picking the
    // single greatest entry would do instead of a sort; the set of types is
    // small, so the cost is negligible.)
    const typesSorted = Object.keys(counts).sort((a, b) => counts[b] - counts[a]);
    let type = typesSorted[0];
    // If most values are "other", take the next-most-encountered type, but
    // only if its tally is strictly greater than the one after it. The
    // tallies must be read from this column's counters, not from typeCounts
    // itself (which is keyed by column name).
    // Else we could iterate over the sample and use the first encountered type.
    if (type === "other" && counts[typesSorted[1]] > counts[typesSorted[2]]) {
      type = typesSorted[1];
    }
    schema.push({name: col, type});
  }
  return schema;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This mutates source.schema, and the change is visible externally. We should avoid that, because mutating an input is an unexpected side effect of calling this function.
If necessary we can use a WeakMap to instead associate sources with valid schemas if we don’t want to re-infer them repeatedly. (It might also make sense to move this schema inference earlier, say in loadDataSource? Not sure though.)