diff --git a/Cargo.lock b/Cargo.lock
index f6c10918f40..c180eeb8308 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -199,6 +199,7 @@ dependencies = [
"flate2 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)",
"futures 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)",
"git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"hyper 0.12.25 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -922,6 +923,31 @@ dependencies = [
"tokio-io 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
]
+[[package]]
+name = "handlebars"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest_derive 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)",
+ "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)",
+ "walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
[[package]]
name = "hex"
version = "0.3.2"
@@ -1974,6 +2000,14 @@ name = "safemem"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
+[[package]]
+name = "same-file"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
[[package]]
name = "schannel"
version = "0.1.13"
@@ -2717,6 +2751,16 @@ name = "void"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
+[[package]]
+name = "walkdir"
+version = "2.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
[[package]]
name = "want"
version = "0.0.6"
@@ -2751,6 +2795,14 @@ name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
+[[package]]
+name = "winapi-util"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
@@ -2887,6 +2939,8 @@ dependencies = [
"checksum ghost 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5297b71943dc9fea26a3241b178c140ee215798b7f79f7773fd61683e25bca74"
"checksum git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c7339329bfa14a00223244311560d11f8f489b453fb90092af97f267a6090ab0"
"checksum h2 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "ddb2b25a33e231484694267af28fec74ac63b5ccf51ee2065a5e313b834d836e"
+"checksum handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "df044dd42cdb7e32f28557b661406fc0f2494be75199779998810dbc35030e0d"
+"checksum hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e1de41fb8dba9714efd92241565cdff73f78508c95697dd56787d3cba27e2353"
"checksum hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "805026a5d0141ffc30abb3be3173848ad46a1b1664fe632428479619a3644d77"
"checksum hostname 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "21ceb46a83a85e824ef93669c8b390009623863b5c195d1ba747292c0c72f94e"
"checksum html5ever 0.22.5 (registry+https://github.com/rust-lang/crates.io-index)" = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e"
@@ -3006,6 +3060,7 @@ dependencies = [
"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7"
"checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f"
"checksum safemem 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8dca453248a96cb0749e36ccdfe2b0b4e54a61bfef89fb97ec621eb8e0a93dd9"
+"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
"checksum schannel 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "dc1fabf2a7b6483a141426e1afd09ad543520a77ac49bd03c286e7696ccfd77f"
"checksum scheduled-thread-pool 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a2ff3fc5223829be817806c6441279c676e454cc7da608faf03b0ccc09d3889"
"checksum scoped-tls 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f417c22df063e9450888a7561788e9bd46d3bb3c1466435b4eccb903807f147d"
@@ -3090,11 +3145,13 @@ dependencies = [
"checksum vcpkg 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9e0a7d8bed3178a8fb112199d466eeca9ed09a14ba8ad67718179b4fd5487d0b"
"checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051"
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
+"checksum walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9658c94fa8b940eab2250bd5a457f9c48b748420d71293b165c8cdbe2f55f71e"
"checksum want 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "797464475f30ddb8830cc529aaaae648d581f99e2036a928877dfde027ddf6b3"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum wincolor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
"checksum winutil 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7daf138b6b14196e3830a588acf1e86966c694d3e8fb026fb105b8b5dca07e6e"
diff --git a/Cargo.toml b/Cargo.toml
index 0c540febc50..610276bde3c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -83,12 +83,14 @@ tokio = "0.1"
hyper = "0.12"
ctrlc = { version = "3.0", features = ["termination"] }
indexmap = "1.0.2"
+handlebars = "2.0.1"
[dev-dependencies]
conduit-test = "0.8"
hyper-tls = "0.3"
lazy_static = "1.0"
tokio-core = "0.1"
+diesel_migrations = { version = "1.3.0", features = ["postgres"] }
[build-dependencies]
dotenv = "0.11"
diff --git a/app/router.js b/app/router.js
index 969fbfc156b..5a5f85cea47 100644
--- a/app/router.js
+++ b/app/router.js
@@ -46,6 +46,7 @@ Router.map(function() {
this.route('category-slugs', { path: 'category_slugs' });
this.route('team', { path: '/teams/:team_id' });
this.route('policies');
+ this.route('data-access');
this.route('confirm', { path: '/confirm/:email_token' });
this.route('catch-all', { path: '*path' });
diff --git a/app/templates/data-access.hbs b/app/templates/data-access.hbs
new file mode 100644
index 00000000000..eb5d5ee0892
--- /dev/null
+++ b/app/templates/data-access.hbs
@@ -0,0 +1,34 @@
+<div id='crates-heading'>
+  {{svg-jar 'circle-with-i'}}
+  <h1>Accessing the Crates.io Data</h1>
+</div>
+
+<p>
+  There are several ways of accessing the Crates.io data. You should try the
+  options in the order listed.
+</p>
+
+<ol>
+  <li>
+    <b>The <a href='https://github.com/rust-lang/crates.io-index'>crates.io
+    index</a>.</b> This git repository is updated by crates.io, and it is used
+    by Cargo to speed up local dependency resolution. It contains the majority
+    of the data exposed by crates.io and is cheap to clone and get updates.
+  </li>
+  <li>
+    <b>The database dumps (experimental).</b> The dump contains all information
+    exposed by the API in a single download. It is updated every 24 hours.
+    The latest dump is available at the address
+    <a href='https://static.crates.io/db-dump.tar.gz'>https://static.crates.io/db-dump.tar.gz</a>.
+    Information on using the dump is contained in the tarball.
+  </li>
+  <li>
+    <b>Crawl the crates.io API.</b> This should be used as a last resort, and
+    doing so is subject to our {{#link-to 'policies'}}crawling policy{{/link-to}}.
+    If the index and the database dumps do not satisfy your needs, we're happy to
+    discuss solutions that don't require you to crawl the registry.
+    You can email us at <a href='mailto:help@crates.io'>help@crates.io</a>.
+  </li>
+</ol>
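
A minimal consumer-side sketch of fetching and unpacking the dump advertised above, using the reqwest (0.9-style blocking API), flate2, and tar crates that already appear in this project's dependency tree; the function name and output paths are illustrative assumptions, not part of this change:

    use std::fs::File;

    fn fetch_db_dump() -> Result<(), Box<dyn std::error::Error>> {
        // Download the tarball to a local file.
        let mut response = reqwest::get("https://static.crates.io/db-dump.tar.gz")?;
        let mut tarball = File::create("db-dump.tar.gz")?;
        std::io::copy(&mut response, &mut tarball)?;

        // Decompress and unpack it into ./db-dump/.
        let archive = File::open("db-dump.tar.gz")?;
        let decoder = flate2::read::GzDecoder::new(archive);
        tar::Archive::new(decoder).unpack("db-dump")?;
        Ok(())
    }
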
diff --git a/app/templates/policies.hbs b/app/templates/policies.hbs
index 3e99e6d441a..3f24f3cc560 100644
--- a/app/templates/policies.hbs
+++ b/app/templates/policies.hbs
@@ -112,15 +112,8 @@
- Before resorting to crawling crates.io, you should first see if you are able to
- gather the information you need from the
- crates.io index,
- which is a public git repository containing the majority
- of the information availble through our API.
-
- If the index does not have the information you need, we're also happy to
- discuss solutions to your needs that don't require you to crawl the registry.
- You can email us at help@crates.io.
+ Before resorting to crawling crates.io, please read
+ {{#link-to 'data-access'}}Accessing the Crates.io Data{{/link-to}}.
diff --git a/migrations/2017-10-08-193512_category_trees/up.sql b/migrations/2017-10-08-193512_category_trees/up.sql
index 579160446b4..0fe64abba14 100644
--- a/migrations/2017-10-08-193512_category_trees/up.sql
+++ b/migrations/2017-10-08-193512_category_trees/up.sql
@@ -1,5 +1,4 @@
--- Your SQL goes here
-CREATE EXTENSION ltree;
+CREATE EXTENSION IF NOT EXISTS ltree;
-- Create the new column which will represent our category tree.
-- Fill it with values from `slug` column and then set to non-null
diff --git a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql
index f188a9cd166..8b38c66cb4e 100644
--- a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql
+++ b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql
@@ -1,2 +1,2 @@
-CREATE EXTENSION pg_trgm;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX index_crates_name_tgrm ON crates USING gin (canon_crate_name(name) gin_trgm_ops);
diff --git a/src/bin/enqueue-job.rs b/src/bin/enqueue-job.rs
index 5a3494acdf0..290219421a8 100644
--- a/src/bin/enqueue-job.rs
+++ b/src/bin/enqueue-job.rs
@@ -1,17 +1,29 @@
-use cargo_registry::util::{CargoError, CargoResult};
-use cargo_registry::{db, tasks};
-use std::env::args;
-use swirl::Job;
+use cargo_registry::util::{human, CargoError, CargoResult};
+use cargo_registry::{db, env, tasks};
+use diesel::PgConnection;
fn main() -> CargoResult<()> {
let conn = db::connect_now()?;
+ let mut args = std::env::args().skip(1);
+ match &*args.next().unwrap_or_default() {
+ "update_downloads" => tasks::update_downloads().enqueue(&conn),
+ "dump_db" => {
+ let database_url = args.next().unwrap_or_else(|| env("DATABASE_URL"));
+ let target_name = args
+ .next()
+ .unwrap_or_else(|| String::from("db-dump.tar.gz"));
+ tasks::dump_db(database_url, target_name).enqueue(&conn)
+ }
+ other => Err(human(&format!("Unrecognized job type `{}`", other))),
+ }
+}
- match &*args().nth(1).unwrap_or_default() {
- "update_downloads" => tasks::update_downloads()
- .enqueue(&conn)
- .map_err(|e| CargoError::from_std_error(e))?,
- other => panic!("Unrecognized job type `{}`", other),
- };
-
- Ok(())
+/// Helper to map the `PerformError` returned by `swirl::Job::enqueue()` to a
+/// `CargoError`. Can be removed once `map_err()` isn't needed any more.
+trait Enqueue: swirl::Job {
+ fn enqueue(self, conn: &PgConnection) -> CargoResult<()> {
+ <Self as swirl::Job>::enqueue(self, conn).map_err(|e| CargoError::from_std_error(e))
+ }
}
+
+impl<J: swirl::Job> Enqueue for J {}
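
The rewritten binary takes the job type as its first argument; `dump_db` additionally accepts an optional database URL and target file name, falling back to the `DATABASE_URL` environment variable and `db-dump.tar.gz`. Hypothetical invocations, assuming a standard Cargo setup and a local Postgres URL:

    cargo run --bin enqueue-job update_downloads
    cargo run --bin enqueue-job dump_db postgres://localhost/cargo_registry db-dump.tar.gz
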
diff --git a/src/tasks.rs b/src/tasks.rs
index 930f83bfc11..ed9e0e91449 100644
--- a/src/tasks.rs
+++ b/src/tasks.rs
@@ -1,3 +1,5 @@
+pub mod dump_db;
mod update_downloads;
+pub use dump_db::dump_db;
pub use update_downloads::update_downloads;
diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs
new file mode 100644
index 00000000000..b6c36a3a966
--- /dev/null
+++ b/src/tasks/dump_db.rs
@@ -0,0 +1,172 @@
+use std::{
+ fs::File,
+ path::{Path, PathBuf},
+};
+
+use crate::{background_jobs::Environment, uploaders::Uploader, util::errors::std_error_no_send};
+
+use swirl::PerformError;
+
+/// Create CSV dumps of the public information in the database, wrap them in a
+/// tarball and upload to S3.
+#[swirl::background_job]
+pub fn dump_db(
+ env: &Environment,
+ database_url: String,
+ target_name: String,
+) -> Result<(), PerformError> {
+ let directory = DumpDirectory::create()?;
+ directory.populate(&database_url)?;
+ let tarball = DumpTarball::create(&directory.export_dir)?;
+ tarball.upload(&target_name, &env.uploader)?;
+ println!("Database dump uploaded to {}.", &target_name);
+ Ok(())
+}
+
+/// Manage the export directory.
+///
+/// Create the directory, populate it with the psql scripts and CSV dumps, and
+/// make sure it gets deleted again even in the case of an error.
+#[derive(Debug)]
+pub struct DumpDirectory {
+ pub timestamp: chrono::DateTime<chrono::Utc>,
+ pub export_dir: PathBuf,
+}
+
+impl DumpDirectory {
+ pub fn create() -> Result<Self, PerformError> {
+ let timestamp = chrono::Utc::now();
+ let timestamp_str = timestamp.format("%Y-%m-%d-%H%M%S").to_string();
+ let export_dir = std::env::temp_dir().join("dump-db").join(timestamp_str);
+ std::fs::create_dir_all(&export_dir)?;
+ Ok(Self {
+ timestamp,
+ export_dir,
+ })
+ }
+
+ pub fn populate(&self, database_url: &str) -> Result<(), PerformError> {
+ self.add_readme()?;
+ self.add_metadata()?;
+ self.dump_schema(database_url)?;
+ self.dump_db(database_url)
+ }
+
+ fn add_readme(&self) -> Result<(), PerformError> {
+ use std::io::Write;
+
+ let mut readme = File::create(self.export_dir.join("README.md"))?;
+ readme.write_all(include_bytes!("dump_db/readme_for_tarball.md"))?;
+ Ok(())
+ }
+
+ fn add_metadata(&self) -> Result<(), PerformError> {
+ #[derive(Serialize)]
+ struct Metadata<'a> {
+ timestamp: &'a chrono::DateTime<chrono::Utc>,
+ crates_io_commit: String,
+ }
+ let metadata = Metadata {
+ timestamp: &self.timestamp,
+ crates_io_commit: dotenv::var("HEROKU_SLUG_COMMIT")
+ .unwrap_or_else(|_| "unknown".to_owned()),
+ };
+ let file = File::create(self.export_dir.join("metadata.json"))?;
+ serde_json::to_writer_pretty(file, &metadata)?;
+ Ok(())
+ }
+
+ pub fn dump_schema(&self, database_url: &str) -> Result<(), PerformError> {
+ let schema_sql = File::create(self.export_dir.join("schema.sql"))?;
+ let status = std::process::Command::new("pg_dump")
+ .arg("--schema-only")
+ .arg("--no-owner")
+ .arg("--no-acl")
+ .arg(database_url)
+ .stdout(schema_sql)
+ .spawn()?
+ .wait()?;
+ if !status.success() {
+ return Err("pg_dump did not finish successfully.".into());
+ }
+ Ok(())
+ }
+
+ pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> {
+ let export_script = self.export_dir.join("export.sql");
+ let import_script = self.export_dir.join("import.sql");
+ gen_scripts::gen_scripts(&export_script, &import_script)?;
+ std::fs::create_dir(self.export_dir.join("data"))?;
+ run_psql(&export_script, database_url)
+ }
+}
+
+impl Drop for DumpDirectory {
+ fn drop(&mut self) {
+ std::fs::remove_dir_all(&self.export_dir).unwrap();
+ }
+}
+
+pub fn run_psql(script: &Path, database_url: &str) -> Result<(), PerformError> {
+ let psql_script = File::open(&script)?;
+ let psql = std::process::Command::new("psql")
+ .arg(database_url)
+ .current_dir(script.parent().unwrap())
+ .stdin(psql_script)
+ .stdout(std::process::Stdio::null())
+ .stderr(std::process::Stdio::piped())
+ .spawn()?;
+ let output = psql.wait_with_output()?;
+ let stderr = String::from_utf8_lossy(&output.stderr);
+ if stderr.contains("ERROR") {
+ return Err(format!("Error while executing psql: {}", stderr).into());
+ }
+ if !output.status.success() {
+ return Err("psql did not finish successfully.".into());
+ }
+ Ok(())
+}
+
+/// Manage the tarball of the database dump.
+///
+/// Create the tarball, upload it to S3, and make sure it gets deleted.
+struct DumpTarball {
+ tarball_path: PathBuf,
+}
+
+impl DumpTarball {
+ fn create(export_dir: &Path) -> Result<Self, PerformError> {
+ let tarball_path = export_dir.with_extension("tar.gz");
+ let tarfile = File::create(&tarball_path)?;
+ let result = Self { tarball_path };
+ let encoder = flate2::write::GzEncoder::new(tarfile, flate2::Compression::default());
+ let mut archive = tar::Builder::new(encoder);
+ archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?;
+ Ok(result)
+ }
+
+ fn upload(&self, target_name: &str, uploader: &Uploader) -> Result<(), PerformError> {
+ let client = reqwest::Client::new();
+ let tarfile = File::open(&self.tarball_path)?;
+ let content_length = tarfile.metadata()?.len();
+ // TODO Figure out the correct content type.
+ uploader
+ .upload(
+ &client,
+ target_name,
+ tarfile,
+ content_length,
+ "application/gzip",
+ )
+ .map_err(std_error_no_send)?;
+ Ok(())
+ }
+}
+
+impl Drop for DumpTarball {
+ fn drop(&mut self) {
+ std::fs::remove_file(&self.tarball_path).unwrap();
+ }
+}
+
+mod gen_scripts;
diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml
new file mode 100644
index 00000000000..38318037c95
--- /dev/null
+++ b/src/tasks/dump_db/dump-db.toml
@@ -0,0 +1,211 @@
+# This file configures what to include in public database dumps. For each
+# database table, we set which columns are included in the dump, and optionally
+# how to filter the rows.
+#
+# <table_name>.columns - a TOML dictionary determining what columns to include.
+# Possible values are "private" (not included) and "public" (included).
+#
+# <table_name>.filter - a string that is a valid SQL expression, which is used
+# in a WHERE clause to filter the rows of the table.
+#
+# <table_name>.dependencies - an array of table names, used to determine the
+# order of the tables in the generated import script. All tables referred
+# to by public columns in the current table should be listed, to make sure
+# they are imported before this table.
+#
+# <table_name>.column_defaults - a TOML dictionary mapping column names to a
+# raw SQL expression that is used as the default value for the column on
+# import. This is useful for private columns that are not nullable and do
+# not have a default.
+
+[api_tokens.columns]
+id = "private"
+user_id = "private"
+token = "private"
+name = "private"
+created_at = "private"
+last_used_at = "private"
+revoked = "private"
+
+[background_jobs.columns]
+id = "private"
+job_type = "private"
+data = "private"
+retries = "private"
+last_retry = "private"
+created_at = "private"
+
+[badges]
+dependencies = ["crates"]
+[badges.columns]
+crate_id = "public"
+badge_type = "public"
+attributes = "public"
+
+[categories.columns]
+id = "public"
+category = "public"
+slug = "public"
+description = "public"
+crates_cnt = "public"
+created_at = "public"
+path = "public"
+
+[crate_owner_invitations.columns]
+invited_user_id = "private"
+invited_by_user_id = "private"
+crate_id = "private"
+created_at = "private"
+
+[crate_owners]
+dependencies = ["crates", "users"]
+filter = "NOT deleted"
+[crate_owners.columns]
+crate_id = "public"
+owner_id = "public"
+created_at = "public"
+created_by = "private"
+deleted = "private"
+updated_at = "public"
+owner_kind = "public"
+
+[crates.columns]
+id = "public"
+name = "public"
+updated_at = "public"
+created_at = "public"
+downloads = "public"
+description = "public"
+homepage = "public"
+documentation = "public"
+readme = "public"
+textsearchable_index_col = "public"
+repository = "public"
+max_upload_size = "public"
+
+[crates_categories]
+dependencies = ["categories", "crates"]
+[crates_categories.columns]
+crate_id = "public"
+category_id = "public"
+
+[crates_keywords]
+dependencies = ["crates", "keywords"]
+[crates_keywords.columns]
+crate_id = "public"
+keyword_id = "public"
+
+[dependencies]
+dependencies = ["crates", "versions"]
+[dependencies.columns]
+id = "public"
+version_id = "public"
+crate_id = "public"
+req = "public"
+optional = "public"
+default_features = "public"
+features = "public"
+target = "public"
+kind = "public"
+
+[__diesel_schema_migrations.columns]
+version = "private"
+run_on = "private"
+
+[emails.columns]
+id = "private"
+user_id = "private"
+email = "private"
+verified = "private"
+token = "private"
+token_generated_at = "private"
+
+[follows.columns]
+user_id = "private"
+crate_id = "private"
+
+[keywords.columns]
+id = "public"
+keyword = "public"
+crates_cnt = "public"
+created_at = "public"
+
+[metadata.columns]
+total_downloads = "public"
+
+[publish_limit_buckets.columns]
+user_id = "private"
+tokens = "private"
+last_refill = "private"
+
+[publish_rate_overrides.columns]
+user_id = "private"
+burst = "private"
+
+[readme_renderings.columns]
+version_id = "private"
+rendered_at = "private"
+
+[reserved_crate_names.columns]
+name = "public"
+
+[teams.columns]
+id = "public"
+login = "public"
+github_id = "public"
+name = "public"
+avatar = "public"
+
+[users]
+filter = """
+id in (
+ SELECT owner_id AS user_id FROM crate_owners WHERE NOT deleted AND owner_kind = 0
+ UNION
+ SELECT published_by as user_id FROM versions
+)"""
+[users.columns]
+id = "public"
+email = "private"
+gh_access_token = "private"
+gh_login = "public"
+name = "public"
+gh_avatar = "public"
+gh_id = "public"
+[users.column_defaults]
+gh_access_token = "''"
+
+[version_authors]
+dependencies = ["versions"]
+[version_authors.columns]
+id = "public"
+version_id = "public"
+user_id = "private"
+name = "public"
+
+[version_downloads]
+dependencies = ["versions"]
+[version_downloads.columns]
+version_id = "public"
+downloads = "public"
+counted = "private"
+date = "public"
+processed = "private"
+
+[versions]
+dependencies = ["crates", "users"]
+[versions.columns]
+id = "public"
+crate_id = "public"
+num = "public"
+updated_at = "public"
+created_at = "public"
+downloads = "public"
+features = "public"
+yanked = "public"
+license = "public"
+crate_size = "public"
+published_by = "public"
+
+[versions_published_by.columns]
+version_id = "private"
+email = "private"
diff --git a/src/tasks/dump_db/dump-export.sql.hbs b/src/tasks/dump_db/dump-export.sql.hbs
new file mode 100644
index 00000000000..0fcf38cba4d
--- /dev/null
+++ b/src/tasks/dump_db/dump-export.sql.hbs
@@ -0,0 +1,21 @@
+BEGIN;
+{{~#each tables}}
+{{~#if this.filter}}
+ CREATE TEMPORARY VIEW "dump_db_{{this.name}}" AS (
+ SELECT {{this.columns}}
+ FROM "{{this.name}}"
+ WHERE {{this.filter}}
+ );
+{{~/if}}
+{{~/each}}
+COMMIT;
+
+BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+{{~#each tables}}
+{{~#if this.filter}}
+ \copy (SELECT * FROM "dump_db_{{this.name}}") TO 'data/{{this.name}}.csv' WITH CSV HEADER
+{{~else}}
+ \copy "{{this.name}}" ({{this.columns}}) TO 'data/{{this.name}}.csv' WITH CSV HEADER
+{{~/if}}
+{{~/each}}
+COMMIT;
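
To make the template concrete: for the `crate_owners` table configured in dump-db.toml (five public columns plus a `NOT deleted` filter), the rendered export script should look roughly like the hand-derived sketch below; column order follows the alphabetical BTreeMap iteration in gen_scripts.rs, and whitespace is approximated. Tables without a filter skip the temporary view and are copied directly.

    BEGIN;
        CREATE TEMPORARY VIEW "dump_db_crate_owners" AS (
            SELECT "crate_id", "created_at", "owner_id", "owner_kind", "updated_at"
            FROM "crate_owners"
            WHERE NOT deleted
        );
    COMMIT;

    BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
        \copy (SELECT * FROM "dump_db_crate_owners") TO 'data/crate_owners.csv' WITH CSV HEADER
    COMMIT;
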
diff --git a/src/tasks/dump_db/dump-import.sql.hbs b/src/tasks/dump_db/dump-import.sql.hbs
new file mode 100644
index 00000000000..49508e7e35a
--- /dev/null
+++ b/src/tasks/dump_db/dump-import.sql.hbs
@@ -0,0 +1,25 @@
+BEGIN;
+ -- Set defaults for non-nullable columns not included in the dump.
+{{~#each tables as |table|}}
+{{~#each column_defaults}}
+ ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" SET DEFAULT {{this}};
+{{~/each}}
+{{~/each}}
+
+ -- Truncate all tables.
+{{~#each tables}}
+ TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE;
+{{~/each}}
+
+ -- Import the CSV data.
+{{~#each tables}}
+ \copy "{{this.name}}" ({{this.columns}}) FROM 'data/{{this.name}}.csv' WITH CSV HEADER
+{{~/each}}
+
+ -- Drop the defaults again.
+{{~#each tables as |table|}}
+{{~#each column_defaults}}
+ ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" DROP DEFAULT;
+{{~/each}}
+{{~/each}}
+COMMIT;
diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs
new file mode 100644
index 00000000000..e128a0165cd
--- /dev/null
+++ b/src/tasks/dump_db/gen_scripts.rs
@@ -0,0 +1,265 @@
+use std::{
+ collections::{BTreeMap, VecDeque},
+ fs::File,
+ path::Path,
+};
+
+use swirl::PerformError;
+
+pub fn gen_scripts(export_script: &Path, import_script: &Path) -> Result<(), PerformError> {
+ let config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap();
+ let export_sql = File::create(export_script)?;
+ let import_sql = File::create(import_script)?;
+ config.gen_psql_scripts(export_sql, import_sql)
+}
+
+/// An enum indicating whether a column is included in the database dumps.
+/// Public columns are included, private are not.
+#[derive(Clone, Copy, Debug, Deserialize, PartialEq)]
+#[serde(rename_all = "lowercase")]
+enum ColumnVisibility {
+ Private,
+ Public,
+}
+
+/// Filtering information for a single table. The `dependencies` field is only
+/// used to determine the order of the tables in the generated import script,
+/// and should list all tables the current table refers to with foreign key
+/// constraints on public columns. The `filter` field is a valid SQL expression
+/// used in a `WHERE` clause to filter the rows of the table. The `columns`
+/// field maps column names to their respective visibilities.
+#[derive(Clone, Debug, Default, Deserialize)]
+struct TableConfig {
+ #[serde(default)]
+ dependencies: Vec<String>,
+ filter: Option<String>,
+ columns: BTreeMap<String, ColumnVisibility>,
+ #[serde(default)]
+ column_defaults: BTreeMap<String, String>,
+}
+
+/// Subset of the configuration data to be passed on to the Handlebars template.
+#[derive(Debug, Serialize)]
+struct HandlebarsTableContext<'a> {
+ name: &'a str,
+ filter: Option<&'a str>,
+ columns: String,
+ column_defaults: BTreeMap<&'a str, &'a str>,
+}
+
+impl TableConfig {
+ fn handlebars_context<'a>(&'a self, name: &'a str) -> Option<HandlebarsTableContext<'a>> {
+ let columns = self
+ .columns
+ .iter()
+ .filter(|&(_, &vis)| vis == ColumnVisibility::Public)
+ .map(|(col, _)| format!("\"{}\"", col))
+ .collect::<Vec<String>>()
+ .join(", ");
+ if columns.is_empty() {
+ None
+ } else {
+ let filter = self.filter.as_ref().map(String::as_str);
+ let column_defaults = self
+ .column_defaults
+ .iter()
+ .map(|(k, v)| (k.as_str(), v.as_str()))
+ .collect();
+ Some(HandlebarsTableContext {
+ name,
+ filter,
+ columns,
+ column_defaults,
+ })
+ }
+ }
+}
+
+/// Maps table names to the respective configurations. Used to load `dump_db.toml`.
+#[derive(Clone, Debug, Default, Deserialize)]
+#[serde(transparent)]
+struct VisibilityConfig(BTreeMap<String, TableConfig>);
+
+/// Subset of the configuration data to be passed on to the Handlebars template.
+#[derive(Debug, Serialize)]
+struct HandlebarsContext<'a> {
+ tables: Vec<HandlebarsTableContext<'a>>,
+}
+
+impl VisibilityConfig {
+ /// Sort the tables in a way that dependencies come before dependent tables.
+ ///
+ /// Returns a vector of table names.
+ fn topological_sort(&self) -> Vec<&str> {
+ let mut result = Vec::new();
+ let mut num_deps = BTreeMap::new();
+ let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new();
+ for (table, config) in self.0.iter() {
+ num_deps.insert(table.as_str(), config.dependencies.len());
+ for dep in &config.dependencies {
+ rev_deps
+ .entry(dep.as_str())
+ .or_default()
+ .push(table.as_str());
+ }
+ }
+ let mut ready: VecDeque<&str> = num_deps
+ .iter()
+ .filter(|(_, &count)| count == 0)
+ .map(|(&table, _)| table)
+ .collect();
+ while let Some(table) = ready.pop_front() {
+ result.push(table);
+ for dep in rev_deps.get(table).iter().cloned().flatten() {
+ *num_deps.get_mut(dep).unwrap() -= 1;
+ if num_deps[dep] == 0 {
+ ready.push_back(dep);
+ }
+ }
+ }
+ assert_eq!(
+ self.0.len(),
+ result.len(),
+ "circular dependencies in database dump configuration detected",
+ );
+ result
+ }
+
+ fn handlebars_context(&self) -> HandlebarsContext<'_> {
+ let tables = self
+ .topological_sort()
+ .into_iter()
+ .filter_map(|table| self.0[table].handlebars_context(table))
+ .collect();
+ HandlebarsContext { tables }
+ }
+
+ fn gen_psql_scripts<W>(&self, export_sql: W, import_sql: W) -> Result<(), PerformError>
+ where
+ W: std::io::Write,
+ {
+ let context = self.handlebars_context();
+ let mut handlebars = handlebars::Handlebars::new();
+ handlebars.register_escape_fn(handlebars::no_escape);
+ handlebars.render_template_to_write(
+ include_str!("dump-export.sql.hbs"),
+ &context,
+ export_sql,
+ )?;
+ handlebars.render_template_to_write(
+ include_str!("dump-import.sql.hbs"),
+ &context,
+ import_sql,
+ )?;
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::test_util::pg_connection;
+ use diesel::prelude::*;
+ use std::collections::HashSet;
+ use std::iter::FromIterator;
+
+ /// Test whether the visibility configuration matches the schema of the
+ /// test database.
+ #[test]
+ fn check_visibility_config() {
+ let conn = pg_connection();
+ let db_columns = HashSet::<Column>::from_iter(get_db_columns(&conn));
+ let vis_columns = toml::from_str::<VisibilityConfig>(include_str!("dump-db.toml"))
+ .unwrap()
+ .0
+ .iter()
+ .flat_map(|(table, config)| {
+ config.columns.iter().map(move |(column, _)| Column {
+ table_name: table.clone(),
+ column_name: column.clone(),
+ })
+ })
+ .collect();
+ let mut errors = vec![];
+ for Column {
+ table_name,
+ column_name,
+ } in db_columns.difference(&vis_columns)
+ {
+ errors.push(format!(
+ "No visibility information for columns {}.{}.",
+ table_name, column_name
+ ));
+ }
+ for Column {
+ table_name,
+ column_name,
+ } in vis_columns.difference(&db_columns)
+ {
+ errors.push(format!(
+ "Column {}.{} does not exist in the database.",
+ table_name, column_name
+ ));
+ }
+ assert!(
+ errors.is_empty(),
+ "The visibility configuration does not match the database schema:\n{}",
+ errors.join("\n"),
+ );
+ }
+
+ mod information_schema {
+ table! {
+ information_schema.columns (table_schema, table_name, column_name) {
+ table_schema -> Text,
+ table_name -> Text,
+ column_name -> Text,
+ ordinal_position -> Integer,
+ }
+ }
+ }
+
+ #[derive(Debug, Eq, Hash, PartialEq, Queryable)]
+ struct Column {
+ table_name: String,
+ column_name: String,
+ }
+
+ fn get_db_columns(conn: &PgConnection) -> Vec<Column> {
+ use information_schema::columns::dsl::*;
+ columns
+ .select((table_name, column_name))
+ .filter(table_schema.eq("public"))
+ .order_by((table_name, ordinal_position))
+ .load(conn)
+ .unwrap()
+ }
+
+ fn table_config_with_deps(deps: &[&str]) -> TableConfig {
+ TableConfig {
+ dependencies: deps.iter().cloned().map(ToOwned::to_owned).collect(),
+ ..Default::default()
+ }
+ }
+
+ #[test]
+ fn test_topological_sort() {
+ let mut config = VisibilityConfig::default();
+ let tables = &mut config.0;
+ tables.insert("a".to_owned(), table_config_with_deps(&["b", "c"]));
+ tables.insert("b".to_owned(), table_config_with_deps(&["c", "d"]));
+ tables.insert("c".to_owned(), table_config_with_deps(&["d"]));
+ config.0.insert("d".to_owned(), table_config_with_deps(&[]));
+ assert_eq!(config.topological_sort(), ["d", "c", "b", "a"]);
+ }
+
+ #[test]
+ #[should_panic]
+ fn topological_sort_panics_for_cyclic_dependency() {
+ let mut config = VisibilityConfig::default();
+ let tables = &mut config.0;
+ tables.insert("a".to_owned(), table_config_with_deps(&["b"]));
+ tables.insert("b".to_owned(), table_config_with_deps(&["a"]));
+ config.topological_sort();
+ }
+}
diff --git a/src/tasks/dump_db/readme_for_tarball.md b/src/tasks/dump_db/readme_for_tarball.md
new file mode 100644
index 00000000000..3d9f431d311
--- /dev/null
+++ b/src/tasks/dump_db/readme_for_tarball.md
@@ -0,0 +1,30 @@
+# crates.io Database Dump
+
+This is a dump of the public information in the crates.io database.
+
+## Files
+
+* `data/` – the CSV files with the actual data.
+* `export.sql` – the `psql` script that was used to create this database dump. It is only included in the archive for reference.
+* `import.sql` – a `psql` script that can be used to restore the dump into a PostgreSQL database with the same schema as the `crates.io` database, destroying all current data.
+* `metadata.json` – some metadata of this dump.
+* `schema.sql` – a dump of the database schema to facilitate generating a new database from the data.
+
+## Metadata Fields
+
+* `timestamp` – the UTC time the dump was started.
+* `crates_io_commit` – the git commit hash of the deployed version of crates.io that created this dump.
+
+## Restoring to a Local crates.io Database
+
+1. Create a new database.
+
+ createdb DATABASE_NAME
+
+2. Restore the database schema.
+
+ psql DATABASE_NAME < schema.sql
+
+3. Run the import script.
+
+ psql DATABASE_NAME < import.sql
diff --git a/src/tests/all.rs b/src/tests/all.rs
index 310cdf5db9c..af9108f48bd 100644
--- a/src/tests/all.rs
+++ b/src/tests/all.rs
@@ -52,6 +52,7 @@ mod badge;
mod builders;
mod categories;
mod category;
+mod dump_db;
mod git;
mod keyword;
mod krate;
diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs
new file mode 100644
index 00000000000..11187d6b16f
--- /dev/null
+++ b/src/tests/dump_db.rs
@@ -0,0 +1,71 @@
+use cargo_registry::tasks::dump_db;
+use diesel::{
+ connection::{Connection, SimpleConnection},
+ pg::PgConnection,
+};
+
+#[test]
+fn dump_db_and_reimport_dump() {
+ let database_url = crate::env("TEST_DATABASE_URL");
+
+ // TODO prefill database with some data
+
+ let directory = dump_db::DumpDirectory::create().unwrap();
+ directory.populate(&database_url).unwrap();
+
+ let schema = TemporarySchema::create(database_url, "test_db_dump");
+ schema.run_migrations();
+
+ let import_script = directory.export_dir.join("import.sql");
+ dump_db::run_psql(&import_script, &schema.database_url).unwrap();
+
+ // TODO: Consistency checks on the re-imported data?
+}
+
+struct TemporarySchema {
+ pub database_url: String,
+ pub schema_name: String,
+ pub connection: PgConnection,
+}
+
+impl TemporarySchema {
+ pub fn create(database_url: String, schema_name: &str) -> Self {
+ let params = &[("options", format!("--search_path={},public", schema_name))];
+ let database_url = url::Url::parse_with_params(&database_url, params)
+ .unwrap()
+ .into_string();
+ let schema_name = schema_name.to_owned();
+ let connection = PgConnection::establish(&database_url).unwrap();
+ connection
+ .batch_execute(&format!(
+ r#"DROP SCHEMA IF EXISTS "{schema_name}" CASCADE;
+ CREATE SCHEMA "{schema_name}";"#,
+ schema_name = schema_name,
+ ))
+ .unwrap();
+ Self {
+ database_url,
+ schema_name,
+ connection,
+ }
+ }
+
+ pub fn run_migrations(&self) {
+ use diesel_migrations::{find_migrations_directory, run_pending_migrations_in_directory};
+ let migrations_dir = find_migrations_directory().unwrap();
+ run_pending_migrations_in_directory(
+ &self.connection,
+ &migrations_dir,
+ &mut std::io::sink(),
+ )
+ .unwrap();
+ }
+}
+
+impl Drop for TemporarySchema {
+ fn drop(&mut self) {
+ self.connection
+ .batch_execute(&format!(r#"DROP SCHEMA "{}" CASCADE;"#, self.schema_name))
+ .unwrap();
+ }
+}