observablehq · libbey-observable · Jan 19, 2023 · Jan 19, 2023 · Jan 20, 2023 · Jan 20, 2023
diff --git a/src/table.js b/src/table.js
@@ -191,8 +191,8 @@ function sourceCache(loadSource) {
 const loadTableDataSource = sourceCache(async (source, name) => {
   if (source instanceof FileAttachment) {
     switch (source.mimeType) {
-      case "text/csv": return source.csv({typed: true});
-      case "text/tab-separated-values": return source.tsv({typed: true});
+      case "text/csv": return source.csv();
+      case "text/tab-separated-values": return source.tsv();
       case "application/json": return source.json();
       case "application/x-sqlite3": return source.sqlite();
     }
@@ -543,8 +543,9 @@ export function getTypeValidator(colType) {
 // DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
 // function to do table operations on in-memory data?
 export function __table(source, operations) {
-  const input = source;
   let {schema, columns} = source;
+  if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source);
+  const input = source;
   let primitive = arrayIsPrimitive(source);
   if (primitive) source = Array.from(source, (value) => ({value}));
   for (const {type, operands} of operations.filter) {
@@ -666,3 +667,89 @@ export function __table(source, operations) {
   }
   return source;
 }
+
+// Returns true if schema is an array whose entries each have a string name and
+// a string type (an empty array passes), and false for any other input.
+function isValidSchema(schema) {
+  if (!Array.isArray(schema)) return false;
+  return schema.every((s) => s && typeof s.name === "string" && typeof s.type === "string");
+}
+
+// Builds a fresh tally for one column: every candidate type starts at zero.
+// A new object is returned on each call so that columns never share counts.
+// The key order follows the list below; inferSchema sorts these keys by count,
+// so with a stable sort the order here decides which type wins a tie.
+function initKey() {
+  const types = [
+    "other", "boolean", "integer", "number", "date", "string", "array", "object",
+    "bigint", // TODO for csv, tsv?
+    "buffer"
+  ];
+  const tally = {};
+  for (const type of types) tally[type] = 0;
+  return tally;
+}
+
+// Infers a schema, an array of {name, type} entries, from a sample of at most
+// the first 100 rows of source. Every sampled value is tallied under a
+// candidate type (see initKey), and each column is assigned the type seen most
+// often; null, undefined, and empty values are tallied as "other". If
+// arrayIsPrimitive reports the sample as primitive, each value is wrapped as
+// {value} so that the data is treated as a single column named "value".
+function inferSchema(source) {
+  const schema = [];
+  const sampleSize = 100;
+  let sample = source.slice(0, sampleSize);
+  if (arrayIsPrimitive(sample)) sample = sample.map((d) => ({value: d}));
+  const typeCounts = {};
+  for (const d of sample) {
+    for (const key in d) {
+      if (!typeCounts[key]) typeCounts[key] = initKey();
+      // for json and sqlite, we already have some types, but for csv and tsv, all
+      // columns are strings here.
+      const type = typeof d[key];
+      const value = type === "string" ? d[key].trim() : d[key];
+      if (value === null || value === undefined || value.length === 0)
+        typeCounts[key].other++;
+      else if (type !== "string") {
+        if (Array.isArray(value)) typeCounts[key].array++;
+        else if (value instanceof Date) typeCounts[key].date++;
+        else if (value instanceof ArrayBuffer) typeCounts[key].buffer++;
+        else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object
+      } else {
+        if (value === "true" || value === "false") typeCounts[key].boolean++;
+        else if (!isNaN(value)) {
+          if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++;
+          else typeCounts[key].number++;
+        } else if (
+          value.match(
+            /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/
+          )
+        )
+          typeCounts[key].date++;
+        else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/))
+          typeCounts[key].date++;
+        else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/))
+          typeCounts[key].date++;
+        else typeCounts[key].string++;
+      }
+    }
+  }
+  for (const col in typeCounts) {
+    // sort descending so most commonly encountered type is first
+    const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) {
+      return typeCounts[col][b] - typeCounts[col][a];
+    });
+    let type = typesSorted[0];
+    if (type === "other") {
+      // take the next-most-encountered type if most are "other", but only if
+      // its tally is greater than the next one in the list
+      // (tallies are stored per column, so look them up under typeCounts[col])
+      if (typeCounts[col][typesSorted[1]] > typeCounts[col][typesSorted[2]])
+        type = typesSorted[1];
+      // else we could iterate over the sample and use the first encountered type
+    }
+    schema.push({name: col, type});
+  }
+  return schema;
+}
diff --git a/test/table-test.js b/test/table-test.js
@@ -454,9 +454,10 @@ describe("__table", () => {
     const operationsNullColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: null}};
     assert.deepStrictEqual(__table(source, operationsNullColumns), source);
     const operationsEmptyColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: []}};
-    assert.deepStrictEqual(__table(source, operationsEmptyColumns), [{}, {}, {}]);
+    // comparing the result of .slice() removes schema from the comparison
+    assert.deepStrictEqual(__table(source, operationsEmptyColumns).slice(), [{}, {}, {}]);
     const operationsSelectedColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: ["a"]}};
-    assert.deepStrictEqual(__table(source, operationsSelectedColumns), [{a: 1}, {a: 2}, {a: 3}]);
+    assert.deepStrictEqual(__table(source, operationsSelectedColumns).slice(), [{a: 1}, {a: 2}, {a: 3}]);
   });
 
   it("__table unknown filter", () => {
@@ -480,7 +481,8 @@ describe("__table", () => {
         {type: "gt", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2}]}
       ]
     };
-    assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]);
+    // comparing the result of .slice() removes schema from the comparison
+    assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]);
   });
 
   it("__table filter lte + gte", () => {
@@ -496,7 +498,8 @@ describe("__table", () => {
         {type: "gte", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2.5}]}
       ]
     };
-    assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]);
+    // comparing the result of .slice() removes schema from the comparison
+    assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]);
   });
 
   it("__table filter primitive lte + gte", () => {
@@ -526,8 +529,9 @@ describe("__table", () => {
       [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}]
     );
     const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]};
+    // comparing the result of .slice() removes schema from the comparison
     assert.deepStrictEqual(
-      __table(source, operationsAsc),
+      __table(source, operationsAsc).slice(),
       [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]
     );
     const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}];
@@ -549,8 +553,9 @@ describe("__table", () => {
       [{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null}]
     );
     const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]};
+    // comparing the result of .slice() removes schema from the comparison
     assert.deepStrictEqual(
-      __table(sourceWithMissing, operationsAsc),
+      __table(sourceWithMissing, operationsAsc).slice(),
       [{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null}]
     );
   });
@@ -561,8 +566,9 @@ describe("__table", () => {
       __table(source, operations),
       [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}]
     );
+    // comparing the result of .slice() removes schema from the comparison
     assert.deepStrictEqual(
-      source,
+      source.slice(),
       [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]
     );
   });
@@ -571,9 +577,10 @@ describe("__table", () => {
     const operationsToNull = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: null}};
     assert.deepStrictEqual(__table(source, operationsToNull), [{a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]);
     const operationsFromNull = {...EMPTY_TABLE_DATA.operations, slice: {from: null, to: 1}};
-    assert.deepStrictEqual(__table(source, operationsFromNull), [{a: 1, b: 2, c: 3}]);
+    // comparing the result of .slice() removes schema from the comparison
+    assert.deepStrictEqual(__table(source, operationsFromNull).slice(), [{a: 1, b: 2, c: 3}]);
     const operations = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: 2}};
-    assert.deepStrictEqual(__table(source, operations), [{a: 2, b: 4, c: 6}]);
+    assert.deepStrictEqual(__table(source, operations).slice(), [{a: 2, b: 4, c: 6}]);
   });
 
   it("__table retains schema and columns info", () => {
@@ -585,6 +592,13 @@ describe("__table", () => {
       [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]
     );
   });
+
+  it("__table infers schema", () => {
+    assert.deepStrictEqual(
+      __table(source, EMPTY_TABLE_DATA.operations).schema,
+      [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]
+    );
+  });
 });
 
 describe("getTypeValidator filters accurately", () => {