Skip to content

Commit

Permalink
Save downloaded DB archives to disk before unzipping
Browse files Browse the repository at this point in the history
This fixes two classes of DBs that can't be installed directly from
downloading:

1. DBs whose central directories do not align with their file headers.
   We need to download and save the entire archive before we can read
   the central directory and use that to guide the unzipping.
2. Large DBs require too much memory so can't be downloaded and unzipped
   in a single stream.

We also add proper progress notifications to the download progress
monitor so users are aware of how many more MBs are left to download.

It's not yet possible to do the same for unzipping using the current
unzipper library, since unzipping using the central directory does not
expose a stream.
  • Loading branch information
aeisenberg committed Dec 7, 2020
1 parent 370dbcb commit 9571223
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 50 deletions.
79 changes: 50 additions & 29 deletions extensions/ql-vscode/src/databaseFetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ import * as path from 'path';

import { DatabaseManager, DatabaseItem } from './databases';
import {
reportStreamProgress,
ProgressCallback,
showAndLogInformationMessage,
} from './helpers';
import { logger } from './logging';
import { tmpDir } from './run-queries';

/**
* Prompts a user to fetch a database from a remote location. Database is assumed to be an archive file.
Expand Down Expand Up @@ -164,7 +166,7 @@ async function databaseArchiveFetcher(
const unzipPath = await getStorageFolder(storagePath, databaseUrl);

if (isFile(databaseUrl)) {
await readAndUnzip(databaseUrl, unzipPath);
await readAndUnzip(databaseUrl, unzipPath, progress);
} else {
await fetchAndUnzip(databaseUrl, unzipPath, progress);
}
Expand Down Expand Up @@ -237,48 +239,67 @@ function validateHttpsUrl(databaseUrl: string) {
}
}

async function readAndUnzip(databaseUrl: string, unzipPath: string) {
const databaseFile = Uri.parse(databaseUrl).fsPath;
const directory = await unzipper.Open.file(databaseFile);
/**
 * Extracts the zip archive identified by `zipUrl` into `unzipPath`.
 *
 * @param zipUrl URI string of the archive; it is parsed with `Uri.parse`
 *   and the resulting `fsPath` is the file that gets opened.
 * @param unzipPath Directory that receives the extracted contents.
 * @param progress Optional callback, notified once before extraction starts.
 */
async function readAndUnzip(
  zipUrl: string,
  unzipPath: string,
  progress?: ProgressCallback
) {
  // TODO: Providing progress as the file is unzipped is currently blocked
  // on https://github.com/ZJONSSON/node-unzipper/issues/222
  const archiveFsPath = Uri.parse(zipUrl).fsPath;

  if (progress) {
    const targetName = path.basename(unzipPath);
    progress({
      maxStep: 10,
      step: 9,
      message: `Unzipping into ${targetName}`
    });
  }

  // Streaming the zip contents may hit local file headers that are not
  // correct, so the archive is opened through its central directory,
  // which is the only part that can be relied on.
  const centralDirectory = await unzipper.Open.file(archiveFsPath);
  await centralDirectory.extract({ path: unzipPath });
}

/**
 * Downloads the database archive at `databaseUrl` to a temporary file and
 * then extracts that file into `unzipPath`.
 *
 * The returned promise rejects if the download stream, the write to the
 * temporary file, or the extraction fails. The temporary archive is removed
 * in every case.
 *
 * @param databaseUrl URL the archive is fetched from.
 * @param unzipPath Directory that receives the extracted contents.
 * @param progress Optional callback that receives download and unzip progress.
 */
async function fetchAndUnzip(
  databaseUrl: string,
  unzipPath: string,
  progress?: ProgressCallback
) {
  // Although it is possible to download and stream directly to an unzipped directory,
  // we need to avoid this for two reasons. The central directory is located at the
  // end of the zip file. It is the source of truth of the content locations. Individual
  // file headers may be incorrect. Additionally, saving to file first will reduce memory
  // pressure compared with unzipping while downloading the archive.
  const archivePath = path.join(tmpDir.name, `archive-${Date.now()}.zip`);

  progress?.({
    maxStep: 3,
    message: 'Downloading database',
    step: 1,
  });

  const response = await checkForFailingResponse(await fetch(databaseUrl));
  const archiveFileStream = fs.createWriteStream(archivePath);

  const contentLength = response.headers.get('content-length');
  const totalNumBytes = contentLength ? parseInt(contentLength, 10) : undefined;
  reportStreamProgress(response.body, 'Downloading database', totalNumBytes, progress);

  try {
    await new Promise<void>((resolve, reject) => {
      // `pipe()` does not forward errors from the source to the destination,
      // so both ends need their own listener. Without the listener on the
      // response body, a failed download would leave this promise pending.
      response.body.on('error', (err: Error) => {
        // The destination is not closed automatically when the source fails.
        archiveFileStream.destroy();
        reject(err);
      });
      archiveFileStream.on('error', reject);
      archiveFileStream.on('finish', () => resolve());
      response.body.pipe(archiveFileStream);
    });

    await readAndUnzip(Uri.file(archivePath).toString(true), unzipPath, progress);
  } finally {
    // Remove archivePath eagerly since these archives can be large. This runs
    // on failure as well so a partial or unreadable archive is not left behind.
    await fs.remove(archivePath);
  }
}

async function checkForFailingResponse(response: Response): Promise<void | never> {
async function checkForFailingResponse(response: Response): Promise<Response | never> {
if (response.ok) {
return;
return response;
}

// An error downloading the database. Attempt to extract the reason behind it.
Expand Down
23 changes: 2 additions & 21 deletions extensions/ql-vscode/src/distribution.ts
Original file line number Diff line number Diff line change
Expand Up @@ -337,27 +337,8 @@ class ExtensionSpecificDistributionManager {
const archiveFile = fs.createWriteStream(archivePath);

const contentLength = assetStream.headers.get('content-length');
let numBytesDownloaded = 0;

if (progressCallback && contentLength !== null) {
const totalNumBytes = parseInt(contentLength, 10);
const bytesToDisplayMB = (numBytes: number): string => `${(numBytes / (1024 * 1024)).toFixed(1)} MB`;
const updateProgress = (): void => {
progressCallback({
step: numBytesDownloaded,
maxStep: totalNumBytes,
message: `Downloading CodeQL CLI… [${bytesToDisplayMB(numBytesDownloaded)} of ${bytesToDisplayMB(totalNumBytes)}]`,
});
};

// Display the progress straight away rather than waiting for the first chunk.
updateProgress();

assetStream.body.on('data', data => {
numBytesDownloaded += data.length;
updateProgress();
});
}
const totalNumBytes = contentLength ? parseInt(contentLength, 10) : undefined;
helpers.reportStreamProgress(assetStream.body, 'Downloading CodeQL CLI…', totalNumBytes, progressCallback);

await new Promise((resolve, reject) =>
assetStream.body.pipe(archiveFile)
Expand Down
43 changes: 43 additions & 0 deletions extensions/ql-vscode/src/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -530,3 +530,46 @@ export async function isLikelyDatabaseRoot(maybeRoot: string) {
/**
 * Reports whether the last path segment of `dbPath` starts with `db-`.
 *
 * @param dbPath Path whose base name is inspected.
 * @returns `true` when the base name begins with `db-`, otherwise `false`.
 */
export function isLikelyDbLanguageFolder(dbPath: string) {
  const folderName = path.basename(dbPath);
  return folderName.startsWith('db-');
}


/**
 * Reports, through a progress callback, how much progress has been made
 * reading from a stream.
 *
 * When `totalNumBytes` is a non-zero number, the callback is invoked once
 * immediately and then again on every `data` event, with `step` set to the
 * running byte count and `maxStep` set to `totalNumBytes`. When the total is
 * missing, zero, or `NaN`, the callback is invoked once with a
 * "(Size unknown)" message and no listener is attached. Without a callback
 * the function does nothing.
 *
 * @param readable The stream to read progress from
 * @param messagePrefix A prefix for displaying the message
 * @param totalNumBytes Total number of bytes in this stream
 * @param progress The progress callback used to set messages
 */
export function reportStreamProgress(
  readable: NodeJS.ReadableStream,
  messagePrefix: string,
  totalNumBytes?: number,
  progress?: ProgressCallback
): void {
  if (!progress) {
    return;
  }

  if (!totalNumBytes) {
    progress({
      step: 1,
      maxStep: 2,
      message: `${messagePrefix} (Size unknown)`,
    });
    return;
  }

  const bytesToDisplayMB = (numBytes: number): string =>
    `${(numBytes / (1024 * 1024)).toFixed(1)} MB`;

  // A stream with an encoding set emits strings, whose `.length` counts
  // UTF-16 code units rather than bytes. Measure those as encoded bytes so
  // the running count stays comparable with `totalNumBytes`.
  // NOTE(review): string chunks are assumed to be UTF-8 text; confirm if a
  // caller ever sets a different stream encoding.
  const chunkByteLength = (chunk: string | Uint8Array): number =>
    typeof chunk === 'string' ? Buffer.byteLength(chunk) : chunk.length;

  let numBytesDownloaded = 0;
  const updateProgress = (): void => {
    progress({
      step: numBytesDownloaded,
      maxStep: totalNumBytes,
      message: `${messagePrefix} [${bytesToDisplayMB(numBytesDownloaded)} of ${bytesToDisplayMB(totalNumBytes)}]`,
    });
  };

  // Display the progress straight away rather than waiting for the first chunk.
  updateProgress();

  readable.on('data', (chunk: string | Uint8Array) => {
    numBytesDownloaded += chunkByteLength(chunk);
    updateProgress();
  });
}

0 comments on commit 9571223

Please sign in to comment.