Skip to content

Infer schema for relevant data sources #344

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 90 additions & 3 deletions src/table.js
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ function sourceCache(loadSource) {
const loadTableDataSource = sourceCache(async (source, name) => {
if (source instanceof FileAttachment) {
switch (source.mimeType) {
case "text/csv": return source.csv({typed: true});
case "text/tab-separated-values": return source.tsv({typed: true});
case "text/csv": return source.csv();
case "text/tab-separated-values": return source.tsv();
case "application/json": return source.json();
case "application/x-sqlite3": return source.sqlite();
}
Expand Down Expand Up @@ -543,8 +543,9 @@ export function getTypeValidator(colType) {
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
// function to do table operations on in-memory data?
export function __table(source, operations) {
const input = source;
let {schema, columns} = source;
if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is mutating the source.schema which will be visible externally and we should avoid because mutation of inputs is an unexpected side-effect of calling this function.

If necessary we can use a WeakMap to instead associate sources with valid schemas if we don’t want to re-infer them repeatedly. (It might also make sense to move this schema inference earlier, say in loadDataSource? Not sure though.)

const input = source;
let primitive = arrayIsPrimitive(source);
if (primitive) source = Array.from(source, (value) => ({value}));
for (const {type, operands} of operations.filter) {
Expand Down Expand Up @@ -666,3 +667,89 @@ export function __table(source, operations) {
}
return source;
}

// Returns a truthy value iff schema is an array whose every entry is an
// object with a string "name" and a string "type" property.
// BUG FIX: the original arrow callback used a braced body without `return`,
// so `every` received `undefined` from each element and reported `false`
// for every non-empty schema (and `true` only for `[]`).
function isValidSchema(schema) {
  if (!schema || !Array.isArray(schema)) return false;
  return schema.every(
    (s) => s && typeof s.name === "string" && typeof s.type === "string"
  );
}

// Returns a fresh tally object with a zeroed counter for each type that
// inferSchema can recognize. Insertion order matters: it is the tie-break
// order when counts are sorted, so keep "other" first.
function initKey() {
  const counts = {};
  const types = [
    "other",
    "boolean",
    "integer",
    "number",
    "date",
    "string",
    "array",
    "object",
    "bigint", // TODO for csv, tsv?
    "buffer"
  ];
  for (const type of types) counts[type] = 0;
  return counts;
}

// Infers a column schema ([{name, type}]) for a tabular data source by
// tallying the observed type of each value in a sample of up to 100 rows
// and picking the most common type per column.
function inferSchema(source) {
  const schema = [];
  const sampleSize = 100;
  let sample = source.slice(0, sampleSize);
  // Primitive arrays are treated as a single "value" column.
  if (arrayIsPrimitive(sample)) {
    sample = sample.map((value) => ({value}));
  }
  // ISO 8601 date/time, "M/D/YY[YY] HH:MM", or "YYYY-M-D" — combined into one
  // test so a match in any form counts as a date.
  const dateLike =
    /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$|(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})|(\d{4})-(\d{1,2})-(\d{1,2})/;
  const typeCounts = {};
  for (const d of sample) {
    for (const key in d) {
      if (!typeCounts[key]) typeCounts[key] = initKey();
      // for json and sqlite, we already have some types, but for csv and tsv,
      // all columns are strings here.
      const type = typeof d[key];
      const value = type === "string" ? d[key].trim() : d[key];
      if (value === null || value === undefined || value.length === 0)
        typeCounts[key].other++;
      else if (type !== "string") {
        if (Array.isArray(value)) typeCounts[key].array++;
        else if (value instanceof Date) typeCounts[key].date++;
        else if (value instanceof ArrayBuffer) typeCounts[key].buffer++;
        else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object
      } else {
        if (value === "true" || value === "false") typeCounts[key].boolean++;
        else if (!isNaN(value)) {
          if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++;
          else typeCounts[key].number++;
        } else if (dateLike.test(value)) typeCounts[key].date++;
        else typeCounts[key].string++;
      }
    }
  }
  for (const col in typeCounts) {
    // sort descending so most commonly encountered type is first
    const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) {
      return typeCounts[col][b] - typeCounts[col][a];
    });
    let type = typesSorted[0];
    if (type === "other") {
      // take the next-most-encountered type if most are "other", but only if
      // its tally is greater than the next one in the list.
      // BUG FIX: the original compared typeCounts[typesSorted[1]] —
      // indexing the column map by a type name, always undefined — instead
      // of the per-column tallies typeCounts[col][...].
      if (typeCounts[col][typesSorted[1]] > typeCounts[col][typesSorted[2]])
        type = typesSorted[1];
      // else we could iterate over the sample and use the first encountered type
    }
    schema.push({
      name: col,
      type: type
    });
  }
  return schema;
}
32 changes: 23 additions & 9 deletions test/table-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -454,9 +454,10 @@ describe("__table", () => {
const operationsNullColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: null}};
assert.deepStrictEqual(__table(source, operationsNullColumns), source);
const operationsEmptyColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: []}};
assert.deepStrictEqual(__table(source, operationsEmptyColumns), [{}, {}, {}]);
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(__table(source, operationsEmptyColumns).slice(), [{}, {}, {}]);
const operationsSelectedColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: ["a"]}};
assert.deepStrictEqual(__table(source, operationsSelectedColumns), [{a: 1}, {a: 2}, {a: 3}]);
assert.deepStrictEqual(__table(source, operationsSelectedColumns).slice(), [{a: 1}, {a: 2}, {a: 3}]);
});

it("__table unknown filter", () => {
Expand All @@ -480,7 +481,8 @@ describe("__table", () => {
{type: "gt", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2}]}
]
};
assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]);
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]);
});

it("__table filter lte + gte", () => {
Expand All @@ -496,7 +498,8 @@ describe("__table", () => {
{type: "gte", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2.5}]}
]
};
assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]);
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]);
});

it("__table filter primitive lte + gte", () => {
Expand Down Expand Up @@ -526,8 +529,9 @@ describe("__table", () => {
[{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}]
);
const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]};
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(
__table(source, operationsAsc),
__table(source, operationsAsc).slice(),
[{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]
);
const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}];
Expand All @@ -549,8 +553,9 @@ describe("__table", () => {
[{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null}]
);
const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]};
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(
__table(sourceWithMissing, operationsAsc),
__table(sourceWithMissing, operationsAsc).slice(),
[{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null}]
);
});
Expand All @@ -561,8 +566,9 @@ describe("__table", () => {
__table(source, operations),
[{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}]
);
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(
source,
source.slice(),
[{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]
);
});
Expand All @@ -571,9 +577,10 @@ describe("__table", () => {
const operationsToNull = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: null}};
assert.deepStrictEqual(__table(source, operationsToNull), [{a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]);
const operationsFromNull = {...EMPTY_TABLE_DATA.operations, slice: {from: null, to: 1}};
assert.deepStrictEqual(__table(source, operationsFromNull), [{a: 1, b: 2, c: 3}]);
// comparing the result of .slice() removes schema from the comparison
assert.deepStrictEqual(__table(source, operationsFromNull).slice(), [{a: 1, b: 2, c: 3}]);
const operations = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: 2}};
assert.deepStrictEqual(__table(source, operations), [{a: 2, b: 4, c: 6}]);
assert.deepStrictEqual(__table(source, operations).slice(), [{a: 2, b: 4, c: 6}]);
});

it("__table retains schema and columns info", () => {
Expand All @@ -585,6 +592,13 @@ describe("__table", () => {
[{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]
);
});

it("__table infers schema", () => {
assert.deepStrictEqual(
__table(source, EMPTY_TABLE_DATA.operations).schema,
[{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]
);
});
});

describe("getTypeValidator filters accurately", () => {
Expand Down