Skip to content

Commit 77e77c5

Browse files
mkfreeman and mbostock authored
Add DuckDBClient as a recommended library (#310)
* Add sketch of duckdb * Expose entire DuckDBClient * Beautify file * DuckDBClient * default query.castTimestampToDate * optimize queryRow * use arrow types for schema * Apache Arrow for DuckDBClient Co-authored-by: Mike Bostock <[email protected]>
1 parent 7f0f870 commit 77e77c5

File tree

10 files changed

+367
-10
lines changed

10 files changed

+367
-10
lines changed

bin/resolve-dependencies

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ const mains = ["unpkg", "jsdelivr", "browser", "main"];
6464
}
6565
{
6666
const package = await resolve("apache-arrow@4");
67-
console.log(`export const arrow = dependency("${package.name}", "${package.version}", "${package.export}");`);
67+
console.log(`export const arrow4 = dependency("${package.name}", "${package.version}", "${package.export}");`);
68+
}
69+
{
70+
const package = await resolve("apache-arrow@9");
71+
console.log(`export const arrow9 = dependency("${package.name}", "${package.version}", "+esm");`);
6872
}
6973
{
7074
const package = await resolve("arquero");
@@ -86,6 +90,10 @@ const mains = ["unpkg", "jsdelivr", "browser", "main"];
8690
const package = await resolve("leaflet");
8791
console.log(`export const leaflet = dependency("${package.name}", "${package.version}", "${package.export.replace(/-src\.js$/, ".js")}");`);
8892
}
93+
{
94+
const package = await resolve("@duckdb/duckdb-wasm");
95+
console.log(`export const duckdb = dependency("${package.name}", "${package.version}", "+esm");`);
96+
}
8997
})();
9098

9199
async function resolve(specifier) {

rollup.config.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ export default [
1515
reserved: [
1616
"FileAttachment",
1717
"RequireError",
18+
"DuckDBClient",
1819
"SQLiteDatabaseClient",
1920
"Workbook",
2021
"ZipArchive",

src/arrow.js

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Returns true if the value is an Apache Arrow table. This uses a “duck” test
2+
// (instead of strict instanceof) because we want it to work with a range of
3+
// Apache Arrow versions (7.0.0 and above).
4+
// https://arrow.apache.org/docs/7.0/js/classes/Arrow_dom.Table.html
5+
// Structural test for an Apache Arrow table: the value must expose getChild
// and toArray methods and a schema whose fields property is an array.
//
// value: any value to test.
// Returns a strict boolean. The chain is wrapped in Boolean() so that falsy
// inputs (null, undefined, 0, "") and a missing schema yield false rather
// than leaking the falsy operand itself to the caller.
export function isArrowTable(value) {
  return Boolean(
    value &&
      typeof value.getChild === "function" &&
      typeof value.toArray === "function" &&
      value.schema &&
      Array.isArray(value.schema.fields)
  );
}
14+
15+
// Describes the columns of an Arrow table: returns one descriptor per entry
// of table.schema.fields, in the same order.
export function getArrowTableSchema(table) {
  const {fields} = table.schema;
  return fields.map((field) => getArrowFieldSchema(field));
}
18+
19+
// Converts a single Arrow schema field into a column descriptor holding the
// field name, a coarse type label (via getArrowType), the nullable flag, and
// the Arrow type rendered as a string.
function getArrowFieldSchema(field) {
  const {name, type, nullable} = field;
  return {
    name,
    type: getArrowType(type),
    nullable,
    databaseType: String(type)
  };
}
27+
28+
// https://github.com/apache/arrow/blob/89f9a0948961f6e94f1ef5e4f310b707d22a3c11/js/src/enum.ts#L140-L141
29+
// Coarse type label keyed by Arrow type id. Ids that are absent from this
// table (including 11 Interval and 17 Map) are reported as "other".
const arrowTypeLabels = new Map([
  [2, "integer"], // Int
  [3, "number"], // Float
  [7, "number"], // Decimal
  [4, "buffer"], // Binary
  [15, "buffer"], // FixedSizeBinary
  [5, "string"], // Utf8
  [6, "boolean"], // Bool
  [8, "date"], // Date
  [9, "date"], // Time
  [10, "date"], // Timestamp
  [12, "array"], // List
  [16, "array"], // FixedSizeList
  [13, "object"], // Struct
  [14, "object"] // Union
]);

// Maps an Arrow data type (identified by its typeId) to a coarse type label.
function getArrowType(type) {
  const label = arrowTypeLabels.get(type.typeId);
  return label === undefined ? "other" : label;
}

src/dependencies.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ export const sql = dependency("sql.js", "1.7.0", "dist/sql-wasm.js");
1313
export const vega = dependency("vega", "5.22.1", "build/vega.min.js");
1414
export const vegalite = dependency("vega-lite", "5.5.0", "build/vega-lite.min.js");
1515
export const vegaliteApi = dependency("vega-lite-api", "5.0.0", "build/vega-lite-api.min.js");
16-
export const arrow = dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js");
16+
export const arrow4 = dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js");
17+
export const arrow9 = dependency("apache-arrow", "9.0.0", "+esm");
1718
export const arquero = dependency("arquero", "4.8.8", "dist/arquero.min.js");
1819
export const topojson = dependency("topojson-client", "3.1.0", "dist/topojson-client.min.js");
1920
export const exceljs = dependency("exceljs", "4.3.0", "dist/exceljs.min.js");
2021
export const mermaid = dependency("mermaid", "9.1.6", "dist/mermaid.min.js");
2122
export const leaflet = dependency("leaflet", "1.8.0", "dist/leaflet.js");
23+
export const duckdb = dependency("@duckdb/duckdb-wasm", "1.17.0", "+esm");

src/duckdb.js

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
import {getArrowTableSchema, isArrowTable} from "./arrow.js";
2+
import {arrow9 as arrow, duckdb} from "./dependencies.js";
3+
import {FileAttachment} from "./fileAttachment.js";
4+
import {cdn} from "./require.js";
5+
6+
// Adapted from https://observablehq.com/@cmudig/duckdb-client
7+
// Copyright 2021 CMU Data Interaction Group
8+
//
9+
// Redistribution and use in source and binary forms, with or without
10+
// modification, are permitted provided that the following conditions are met:
11+
//
12+
// 1. Redistributions of source code must retain the above copyright notice,
13+
// this list of conditions and the following disclaimer.
14+
//
15+
// 2. Redistributions in binary form must reproduce the above copyright notice,
16+
// this list of conditions and the following disclaimer in the documentation
17+
// and/or other materials provided with the distribution.
18+
//
19+
// 3. Neither the name of the copyright holder nor the names of its contributors
20+
// may be used to endorse or promote products derived from this software
21+
// without specific prior written permission.
22+
//
23+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
27+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33+
// POSSIBILITY OF SUCH DAMAGE.
34+
35+
export class DuckDBClient {
36+
constructor(db) {
37+
Object.defineProperties(this, {
38+
_db: {value: db}
39+
});
40+
}
41+
42+
async queryStream(query, params) {
43+
const connection = await this._db.connect();
44+
let reader, batch;
45+
try {
46+
reader = await connection.send(query, params);
47+
batch = await reader.next();
48+
if (batch.done) throw new Error("missing first batch");
49+
} catch (error) {
50+
await connection.close();
51+
throw error;
52+
}
53+
return {
54+
schema: getArrowTableSchema(batch.value),
55+
async *readRows() {
56+
try {
57+
while (!batch.done) {
58+
yield batch.value.toArray();
59+
batch = await reader.next();
60+
}
61+
} finally {
62+
await connection.close();
63+
}
64+
}
65+
};
66+
}
67+
68+
async query(query, params) {
69+
const result = await this.queryStream(query, params);
70+
const results = [];
71+
for await (const rows of result.readRows()) {
72+
for (const row of rows) {
73+
results.push(row);
74+
}
75+
}
76+
results.schema = result.schema;
77+
return results;
78+
}
79+
80+
async queryRow(query, params) {
81+
const result = await this.queryStream(query, params);
82+
const reader = result.readRows();
83+
try {
84+
const {done, value} = await reader.next();
85+
return done || !value.length ? null : value[0];
86+
} finally {
87+
await reader.return();
88+
}
89+
}
90+
91+
async sql(strings, ...args) {
92+
return await this.query(strings.join("?"), args);
93+
}
94+
95+
queryTag(strings, ...params) {
96+
return [strings.join("?"), params];
97+
}
98+
99+
escape(name) {
100+
return `"${name}"`;
101+
}
102+
103+
async describeTables() {
104+
const tables = await this.query(`SHOW TABLES`);
105+
return tables.map(({name}) => ({name}));
106+
}
107+
108+
async describeColumns({table} = {}) {
109+
const columns = await this.query(`DESCRIBE ${table}`);
110+
return columns.map(({column_name, column_type, null: nullable}) => ({
111+
name: column_name,
112+
type: getDuckDBType(column_type),
113+
nullable: nullable !== "NO",
114+
databaseType: column_type
115+
}));
116+
}
117+
118+
static async of(sources = {}, config = {}) {
119+
const db = await createDuckDB();
120+
if (config.query?.castTimestampToDate === undefined) {
121+
config = {...config, query: {...config.query, castTimestampToDate: true}};
122+
}
123+
await db.open(config);
124+
await Promise.all(
125+
Object.entries(sources).map(async ([name, source]) => {
126+
if (source instanceof FileAttachment) { // bare file
127+
await insertFile(db, name, source);
128+
} else if (isArrowTable(source)) { // bare arrow table
129+
await insertArrowTable(db, name, source);
130+
} else if (Array.isArray(source)) { // bare array of objects
131+
await insertArray(db, name, source);
132+
} else if ("data" in source) { // data + options
133+
const {data, ...options} = source;
134+
if (isArrowTable(data)) {
135+
await insertArrowTable(db, name, data, options);
136+
} else {
137+
await insertArray(db, name, data, options);
138+
}
139+
} else if ("file" in source) { // file + options
140+
const {file, ...options} = source;
141+
await insertFile(db, name, file, options);
142+
} else {
143+
throw new Error(`invalid source: ${source}`);
144+
}
145+
})
146+
);
147+
return new DuckDBClient(db);
148+
}
149+
}
150+
151+
// Registers a file with the database and loads it under the given name.
//
// database: the DuckDB handle (registerFileBuffer, registerFileURL, connect).
// name: the table (or, for Parquet, view) name to create.
// file: an object exposing url(), arrayBuffer(), name and mimeType.
// options: extra insert options, spread over {name, schema: "main"}.
//
// CSV and JSON are chosen by MIME type; otherwise the file name extension
// selects Arrow (.arrow) or Parquet (.parquet). Any other file throws an
// "unknown file type" error. The connection is always closed on exit.
async function insertFile(database, name, file, options) {
  const url = await file.url();
  // Blob URLs are registered by content; other URLs are registered by URL.
  if (url.startsWith("blob:")) {
    const buffer = await file.arrayBuffer();
    await database.registerFileBuffer(file.name, new Uint8Array(buffer));
  } else {
    await database.registerFileURL(file.name, url);
  }
  const connection = await database.connect();
  try {
    switch (file.mimeType) {
      case "text/csv":
        return await connection.insertCSVFromPath(file.name, {
          name,
          schema: "main",
          ...options
        });
      case "application/json":
        return await connection.insertJSONFromPath(file.name, {
          name,
          schema: "main",
          ...options
        });
      default:
        if (/\.arrow$/i.test(file.name)) {
          const buffer = new Uint8Array(await file.arrayBuffer());
          return await connection.insertArrowFromIPCStream(buffer, {
            name,
            schema: "main",
            ...options
          });
        }
        if (/\.parquet$/i.test(file.name)) {
          // Quote the view name as an identifier and the path as a string
          // literal, doubling embedded quotes, so that quotes in either
          // value cannot break or alter the statement.
          const view = `"${String(name).replace(/"/g, '""')}"`;
          const path = `'${String(file.name).replace(/'/g, "''")}'`;
          return await connection.query(
            `CREATE VIEW ${view} AS SELECT * FROM parquet_scan(${path})`
          );
        }
        throw new Error(`unknown file type: ${file.mimeType}`);
    }
  } finally {
    await connection.close();
  }
}
194+
195+
// Serializes an Arrow table to IPC bytes and inserts it into the database
// under the given name, using schema "main" unless options override it. The
// connection is closed whether or not the insert succeeds.
async function insertArrowTable(database, name, table, options) {
  const arrowLib = await loadArrow();
  const ipcBytes = arrowLib.tableToIPC(table);
  const conn = await database.connect();
  try {
    const target = {name, schema: "main", ...options};
    await conn.insertArrowFromIPCStream(ipcBytes, target);
  } finally {
    await conn.close();
  }
}
209+
210+
// Builds an Arrow table from an array of objects (via tableFromJSON) and
// inserts it through insertArrowTable.
async function insertArray(database, name, array, options) {
  const arrowLib = await loadArrow();
  const converted = arrowLib.tableFromJSON(array);
  return insertArrowTable(database, name, converted, options);
}
215+
216+
// Imports the DuckDB module from the CDN, lets it select between the "mvp"
// and "eh" bundles, starts a worker for the selected bundle, and resolves to
// an instantiated AsyncDuckDB.
async function createDuckDB() {
  // Absolute CDN URL of a file inside the DuckDB package.
  const asset = (path) => `${cdn}${duckdb.resolve(path)}`;
  const duck = await import(`${cdn}${duckdb.resolve()}`);
  const candidates = {
    mvp: {
      mainModule: asset("dist/duckdb-mvp.wasm"),
      mainWorker: asset("dist/duckdb-browser-mvp.worker.js")
    },
    eh: {
      mainModule: asset("dist/duckdb-eh.wasm"),
      mainWorker: asset("dist/duckdb-browser-eh.worker.js")
    }
  };
  const selected = await duck.selectBundle(candidates);
  const logger = new duck.ConsoleLogger();
  const worker = await duck.createWorker(selected.mainWorker);
  const instance = new duck.AsyncDuckDB(logger, worker);
  await instance.instantiate(selected.mainModule);
  return instance;
}
234+
235+
// Dynamically imports the Apache Arrow module from the CDN and resolves to
// its module namespace.
async function loadArrow() {
  const url = `${cdn}${arrow.resolve()}`;
  return import(url);
}
238+
239+
// https://duckdb.org/docs/sql/data_types/overview
240+
// Maps a DuckDB column type name (as reported by DESCRIBE) to a coarse type
// label: "bigint", "number", "integer", "boolean", "date", "string" or
// "other".
//
// type: the DuckDB type name, e.g. "INTEGER" or "DECIMAL(18,3)".
// Unlisted types such as BLOB, INTERVAL and TIME are reported as "other".
function getDuckDBType(type) {
  switch (type) {
    case "BIGINT":
    case "HUGEINT":
    case "UBIGINT":
      return "bigint";
    case "DOUBLE":
    case "REAL":
    case "FLOAT": // name DuckDB reports for single-precision (REAL) columns
      return "number";
    case "INTEGER":
    case "SMALLINT":
    case "TINYINT":
    case "USMALLINT":
    case "UINTEGER":
    case "UTINYINT":
      return "integer";
    case "BOOLEAN":
      return "boolean";
    case "DATE":
    case "TIMESTAMP":
    case "TIMESTAMP WITH TIME ZONE":
      return "date";
    case "VARCHAR":
    case "UUID":
      return "string";
    default:
      // NOTE(review): DECIMAL is labelled "integer" here, while Arrow Decimal
      // columns are labelled "number" in arrow.js; confirm which is intended.
      if (/^DECIMAL\(/.test(type)) return "integer";
      return "other";
  }
}

0 commit comments

Comments
 (0)