diff --git a/Cargo.lock b/Cargo.lock index f6c10918f40..c180eeb8308 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,6 +199,7 @@ dependencies = [ "flate2 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", "git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "hyper 0.12.25 (registry+https://github.com/rust-lang/crates.io-index)", @@ -922,6 +923,31 @@ dependencies = [ "tokio-io 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "handlebars" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "pest 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_derive 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "hashbrown" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "hex" version = "0.3.2" @@ -1974,6 +2000,14 @@ name = "safemem" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "same-file" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "schannel" version = "0.1.13" @@ -2717,6 +2751,16 @@ name = "void" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "walkdir" +version = "2.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "want" version = "0.0.6" @@ -2751,6 +2795,14 @@ name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "winapi-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -2887,6 +2939,8 @@ dependencies = [ "checksum ghost 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5297b71943dc9fea26a3241b178c140ee215798b7f79f7773fd61683e25bca74" "checksum git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"c7339329bfa14a00223244311560d11f8f489b453fb90092af97f267a6090ab0" "checksum h2 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "ddb2b25a33e231484694267af28fec74ac63b5ccf51ee2065a5e313b834d836e" +"checksum handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "df044dd42cdb7e32f28557b661406fc0f2494be75199779998810dbc35030e0d" +"checksum hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e1de41fb8dba9714efd92241565cdff73f78508c95697dd56787d3cba27e2353" "checksum hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "805026a5d0141ffc30abb3be3173848ad46a1b1664fe632428479619a3644d77" "checksum hostname 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "21ceb46a83a85e824ef93669c8b390009623863b5c195d1ba747292c0c72f94e" "checksum html5ever 0.22.5 (registry+https://github.com/rust-lang/crates.io-index)" = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e" @@ -3006,6 +3060,7 @@ dependencies = [ "checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" "checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f" "checksum safemem 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8dca453248a96cb0749e36ccdfe2b0b4e54a61bfef89fb97ec621eb8e0a93dd9" +"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum schannel 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "dc1fabf2a7b6483a141426e1afd09ad543520a77ac49bd03c286e7696ccfd77f" "checksum scheduled-thread-pool 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a2ff3fc5223829be817806c6441279c676e454cc7da608faf03b0ccc09d3889" "checksum scoped-tls 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f417c22df063e9450888a7561788e9bd46d3bb3c1466435b4eccb903807f147d" @@ -3090,11 +3145,13 @@ dependencies = [ "checksum vcpkg 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9e0a7d8bed3178a8fb112199d466eeca9ed09a14ba8ad67718179b4fd5487d0b" "checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +"checksum walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9658c94fa8b940eab2250bd5a457f9c48b748420d71293b165c8cdbe2f55f71e" "checksum want 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "797464475f30ddb8830cc529aaaae648d581f99e2036a928877dfde027ddf6b3" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.2 
(registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" "checksum wincolor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" "checksum winutil 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7daf138b6b14196e3830a588acf1e86966c694d3e8fb026fb105b8b5dca07e6e" diff --git a/Cargo.toml b/Cargo.toml index 0c540febc50..610276bde3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,12 +83,14 @@ tokio = "0.1" hyper = "0.12" ctrlc = { version = "3.0", features = ["termination"] } indexmap = "1.0.2" +handlebars = "2.0.1" [dev-dependencies] conduit-test = "0.8" hyper-tls = "0.3" lazy_static = "1.0" tokio-core = "0.1" +diesel_migrations = { version = "1.3.0", features = ["postgres"] } [build-dependencies] dotenv = "0.11" diff --git a/app/router.js b/app/router.js index 969fbfc156b..5a5f85cea47 100644 --- a/app/router.js +++ b/app/router.js @@ -46,6 +46,7 @@ Router.map(function() { this.route('category-slugs', { path: 'category_slugs' }); this.route('team', { path: '/teams/:team_id' }); this.route('policies'); + this.route('data-access'); this.route('confirm', { path: '/confirm/:email_token' }); this.route('catch-all', { path: '*path' }); diff --git a/app/templates/data-access.hbs b/app/templates/data-access.hbs new file mode 100644 index 00000000000..eb5d5ee0892 --- /dev/null +++ b/app/templates/data-access.hbs @@ -0,0 +1,34 @@ +
+ {{svg-jar 'circle-with-i'}} +

Accessing the Crates.io Data

+
+ +

+ There are several ways of accessing the Crates.io data. You should try the + options in the order listed. +

+ +
  1. The crates.io index.
     This git repository is updated by crates.io, and it is used
     by Cargo to speed up local dependency resolution. It contains the majority
     of the data exposed by crates.io and is cheap to clone and get updates.

  2. The database dumps (experimental). The dump contains all information
     exposed by the API in a single download. It is updated every 24 hours.
     The latest dump is available at the address
     https://static.crates.io/db-dump.tar.gz.
     Information on using the dump is contained in the tarball.

  3. Crawl the crates.io API. This should be used as a last resort, and
     doing so is subject to our {{#link-to 'policies'}}crawling policy{{/link-to}}.
     If the index and the database dumps do not satisfy your needs, we're happy to
     discuss solutions that don't require you to crawl the registry.
     You can email us at help@crates.io.
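As a concrete illustration of option 2 (not part of this patch): the dump can be fetched and unpacked with crates that already appear in this PR's dependency tree (`reqwest` with its 0.9-era blocking API, `flate2`, and `tar`). The URL is the one given above; the archive layout matches the tarball README added further down in this diff.

```rust
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Download the latest dump (in reqwest 0.9, Response implements std::io::Read).
    let mut response = reqwest::get("https://static.crates.io/db-dump.tar.gz")?;
    let mut tarball = File::create("db-dump.tar.gz")?;
    std::io::copy(&mut response, &mut tarball)?;

    // Unpack it. The archive contains a single timestamped directory holding
    // README.md, metadata.json, schema.sql, export.sql, import.sql and data/*.csv.
    let decoder = flate2::read::GzDecoder::new(File::open("db-dump.tar.gz")?);
    let mut archive = tar::Archive::new(decoder);
    archive.unpack(".")?;
    Ok(())
}
```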
diff --git a/app/templates/policies.hbs b/app/templates/policies.hbs index 3e99e6d441a..3f24f3cc560 100644 --- a/app/templates/policies.hbs +++ b/app/templates/policies.hbs @@ -112,15 +112,8 @@

Crawlers

- Before resorting to crawling crates.io, you should first see if you are able to - gather the information you need from the - crates.io index, - which is a public git repository containing the majority - of the information availble through our API. - - If the index does not have the information you need, we're also happy to - discuss solutions to your needs that don't require you to crawl the registry. - You can email us at help@crates.io. + Before resorting to crawling crates.io, please read + {{#link-to 'data-access'}}Accessing the Crates.io Data{{/link-to}}.

diff --git a/migrations/2017-10-08-193512_category_trees/up.sql b/migrations/2017-10-08-193512_category_trees/up.sql index 579160446b4..0fe64abba14 100644 --- a/migrations/2017-10-08-193512_category_trees/up.sql +++ b/migrations/2017-10-08-193512_category_trees/up.sql @@ -1,5 +1,4 @@ --- Your SQL goes here -CREATE EXTENSION ltree; +CREATE EXTENSION IF NOT EXISTS ltree; -- Create the new column which will represent our category tree. -- Fill it with values from `slug` column and then set to non-null diff --git a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql index f188a9cd166..8b38c66cb4e 100644 --- a/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql +++ b/migrations/2019-05-14-165316_index_crate_name_for_like/up.sql @@ -1,2 +1,2 @@ -CREATE EXTENSION pg_trgm; +CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE INDEX index_crates_name_tgrm ON crates USING gin (canon_crate_name(name) gin_trgm_ops); diff --git a/src/bin/enqueue-job.rs b/src/bin/enqueue-job.rs index 5a3494acdf0..290219421a8 100644 --- a/src/bin/enqueue-job.rs +++ b/src/bin/enqueue-job.rs @@ -1,17 +1,29 @@ -use cargo_registry::util::{CargoError, CargoResult}; -use cargo_registry::{db, tasks}; -use std::env::args; -use swirl::Job; +use cargo_registry::util::{human, CargoError, CargoResult}; +use cargo_registry::{db, env, tasks}; +use diesel::PgConnection; fn main() -> CargoResult<()> { let conn = db::connect_now()?; + let mut args = std::env::args().skip(1); + match &*args.next().unwrap_or_default() { + "update_downloads" => tasks::update_downloads().enqueue(&conn), + "dump_db" => { + let database_url = args.next().unwrap_or_else(|| env("DATABASE_URL")); + let target_name = args .next() .unwrap_or_else(|| String::from("db-dump.tar.gz")); + tasks::dump_db(database_url, target_name).enqueue(&conn) + } + other => Err(human(&format!("Unrecognized job type `{}`", other))), + } +} - match &*args().nth(1).unwrap_or_default() { - "update_downloads" => tasks::update_downloads() .enqueue(&conn) .map_err(|e| CargoError::from_std_error(e))?, - other => panic!("Unrecognized job type `{}`", other), - }; - - Ok(()) +/// Helper to map the `PerformError` returned by `swirl::Job::enqueue()` to a +/// `CargoError`. Can be removed once `map_err()` isn't needed any more. +trait Enqueue: swirl::Job { + fn enqueue(self, conn: &PgConnection) -> CargoResult<()> { + <Self as swirl::Job>::enqueue(self, conn).map_err(|e| CargoError::from_std_error(e)) + } } + +impl<J: swirl::Job> Enqueue for J {} diff --git a/src/tasks.rs b/src/tasks.rs index 930f83bfc11..ed9e0e91449 100644 --- a/src/tasks.rs +++ b/src/tasks.rs @@ -1,3 +1,5 @@ +pub mod dump_db; mod update_downloads; +pub use dump_db::dump_db; pub use update_downloads::update_downloads; diff --git a/src/tasks/dump_db.rs b/src/tasks/dump_db.rs new file mode 100644 index 00000000000..b6c36a3a966 --- /dev/null +++ b/src/tasks/dump_db.rs @@ -0,0 +1,172 @@ +use std::{ + fs::File, + path::{Path, PathBuf}, +}; + +use crate::{background_jobs::Environment, uploaders::Uploader, util::errors::std_error_no_send}; + +use swirl::PerformError; + +/// Create CSV dumps of the public information in the database, wrap them in a +/// tarball and upload to S3.
+#[swirl::background_job] +pub fn dump_db( + env: &Environment, + database_url: String, + target_name: String, +) -> Result<(), PerformError> { + let directory = DumpDirectory::create()?; + directory.populate(&database_url)?; + let tarball = DumpTarball::create(&directory.export_dir)?; + tarball.upload(&target_name, &env.uploader)?; + println!("Database dump uploaded to {}.", &target_name); + Ok(()) +} + +/// Manage the export directory. +/// +/// Create the directory, populate it with the psql scripts and CSV dumps, and +/// make sure it gets deleted again even in the case of an error. +#[derive(Debug)] +pub struct DumpDirectory { + pub timestamp: chrono::DateTime<chrono::Utc>, + pub export_dir: PathBuf, +} + +impl DumpDirectory { + pub fn create() -> Result<Self, PerformError> { + let timestamp = chrono::Utc::now(); + let timestamp_str = timestamp.format("%Y-%m-%d-%H%M%S").to_string(); + let export_dir = std::env::temp_dir().join("dump-db").join(timestamp_str); + std::fs::create_dir_all(&export_dir)?; + Ok(Self { + timestamp, + export_dir, + }) + } + + pub fn populate(&self, database_url: &str) -> Result<(), PerformError> { + self.add_readme()?; + self.add_metadata()?; + self.dump_schema(database_url)?; + self.dump_db(database_url) + } + + fn add_readme(&self) -> Result<(), PerformError> { + use std::io::Write; + + let mut readme = File::create(self.export_dir.join("README.md"))?; + readme.write_all(include_bytes!("dump_db/readme_for_tarball.md"))?; + Ok(()) + } + + fn add_metadata(&self) -> Result<(), PerformError> { + #[derive(Serialize)] + struct Metadata<'a> { + timestamp: &'a chrono::DateTime<chrono::Utc>, + crates_io_commit: String, + } + let metadata = Metadata { + timestamp: &self.timestamp, + crates_io_commit: dotenv::var("HEROKU_SLUG_COMMIT") + .unwrap_or_else(|_| "unknown".to_owned()), + }; + let file = File::create(self.export_dir.join("metadata.json"))?; + serde_json::to_writer_pretty(file, &metadata)?; + Ok(()) + } + + pub fn dump_schema(&self, database_url: &str) -> Result<(), PerformError> { + let schema_sql = File::create(self.export_dir.join("schema.sql"))?; + let status = std::process::Command::new("pg_dump") + .arg("--schema-only") + .arg("--no-owner") + .arg("--no-acl") + .arg(database_url) + .stdout(schema_sql) + .spawn()? + .wait()?; + if !status.success() { + return Err("pg_dump did not finish successfully.".into()); + } + Ok(()) + } + + pub fn dump_db(&self, database_url: &str) -> Result<(), PerformError> { + let export_script = self.export_dir.join("export.sql"); + let import_script = self.export_dir.join("import.sql"); + gen_scripts::gen_scripts(&export_script, &import_script)?; + std::fs::create_dir(self.export_dir.join("data"))?; + run_psql(&export_script, database_url) + } +} + +impl Drop for DumpDirectory { + fn drop(&mut self) { + std::fs::remove_dir_all(&self.export_dir).unwrap(); + } +} + +pub fn run_psql(script: &Path, database_url: &str) -> Result<(), PerformError> { + let psql_script = File::open(&script)?; + let psql = std::process::Command::new("psql") + .arg(database_url) + .current_dir(script.parent().unwrap()) + .stdin(psql_script) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + let output = psql.wait_with_output()?; + let stderr = String::from_utf8_lossy(&output.stderr); + if stderr.contains("ERROR") { + return Err(format!("Error while executing psql: {}", stderr).into()); + } + if !output.status.success() { + return Err("psql did not finish successfully.".into()); + } + Ok(()) +} + +/// Manage the tarball of the database dump.
+/// +/// Create the tarball, upload it to S3, and make sure it gets deleted. +struct DumpTarball { + tarball_path: PathBuf, +} + +impl DumpTarball { + fn create(export_dir: &Path) -> Result<Self, PerformError> { + let tarball_path = export_dir.with_extension("tar.gz"); + let tarfile = File::create(&tarball_path)?; + let result = Self { tarball_path }; + let encoder = flate2::write::GzEncoder::new(tarfile, flate2::Compression::default()); + let mut archive = tar::Builder::new(encoder); + archive.append_dir_all(export_dir.file_name().unwrap(), &export_dir)?; + Ok(result) + } + + fn upload(&self, target_name: &str, uploader: &Uploader) -> Result<(), PerformError> { + let client = reqwest::Client::new(); + let tarfile = File::open(&self.tarball_path)?; + let content_length = tarfile.metadata()?.len(); + // TODO Figure out the correct content type. + uploader + .upload( + &client, + target_name, + tarfile, + content_length, + "application/gzip", + ) + .map_err(std_error_no_send)?; + Ok(()) + } +} + +impl Drop for DumpTarball { + fn drop(&mut self) { + std::fs::remove_file(&self.tarball_path).unwrap(); + } +} + +mod gen_scripts; diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml new file mode 100644 index 00000000000..38318037c95 --- /dev/null +++ b/src/tasks/dump_db/dump-db.toml @@ -0,0 +1,211 @@ +# This file configures what to include in public database dumps. For each +# database table, we set which columns are included in the dump, and optionally +# how to filter the rows. +# +# <table>.columns - a TOML dictionary determining what columns to include. +# Possible values are "private" (not included) and "public" (included). +# +# <table>.filter - a string that is a valid SQL expression, which is used +# in a WHERE clause to filter the rows of the table. +# +# <table>.dependencies - an array of table names, used to determine the +# order of the tables in the generated import script. All tables referred +# to by public columns in the current table should be listed, to make sure +# they are imported before this table. +# +# <table>.column_defaults - a TOML dictionary mapping column names to a +# raw SQL expression that is used as the default value for the column on +# import. This is useful for private columns that are not nullable and do +# not have a default.
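To make the format described above concrete, here is an illustrative sketch (not part of the patch) of how one such entry deserializes. The struct shapes mirror the real definitions in `src/tasks/dump_db/gen_scripts.rs` further down in this diff, and assume `serde` with the `derive` feature plus the `toml` crate:

```rust
use std::collections::BTreeMap;
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum ColumnVisibility {
    Private, // not included in the dump
    Public,  // included in the dump
}

#[derive(Debug, Deserialize)]
struct TableConfig {
    #[serde(default)]
    dependencies: Vec<String>,
    filter: Option<String>,
    columns: BTreeMap<String, ColumnVisibility>,
    #[serde(default)]
    column_defaults: BTreeMap<String, String>,
}

fn main() {
    // A single table entry in the format documented above.
    let entry = r#"
        [crate_owners]
        dependencies = ["crates", "users"]
        filter = "NOT deleted"
        [crate_owners.columns]
        crate_id = "public"
        deleted = "private"
    "#;
    let config: BTreeMap<String, TableConfig> = toml::from_str(entry).unwrap();
    assert_eq!(config["crate_owners"].filter, Some("NOT deleted".to_owned()));
    assert_eq!(config["crate_owners"].columns["deleted"], ColumnVisibility::Private);
}
```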
+ +[api_tokens.columns] +id = "private" +user_id = "private" +token = "private" +name = "private" +created_at = "private" +last_used_at = "private" +revoked = "private" + +[background_jobs.columns] +id = "private" +job_type = "private" +data = "private" +retries = "private" +last_retry = "private" +created_at = "private" + +[badges] +dependencies = ["crates"] +[badges.columns] +crate_id = "public" +badge_type = "public" +attributes = "public" + +[categories.columns] +id = "public" +category = "public" +slug = "public" +description = "public" +crates_cnt = "public" +created_at = "public" +path = "public" + +[crate_owner_invitations.columns] +invited_user_id = "private" +invited_by_user_id = "private" +crate_id = "private" +created_at = "private" + +[crate_owners] +dependencies = ["crates", "users"] +filter = "NOT deleted" +[crate_owners.columns] +crate_id = "public" +owner_id = "public" +created_at = "public" +created_by = "private" +deleted = "private" +updated_at = "public" +owner_kind = "public" + +[crates.columns] +id = "public" +name = "public" +updated_at = "public" +created_at = "public" +downloads = "public" +description = "public" +homepage = "public" +documentation = "public" +readme = "public" +textsearchable_index_col = "public" +repository = "public" +max_upload_size = "public" + +[crates_categories] +dependencies = ["categories", "crates"] +[crates_categories.columns] +crate_id = "public" +category_id = "public" + +[crates_keywords] +dependencies = ["crates", "keywords"] +[crates_keywords.columns] +crate_id = "public" +keyword_id = "public" + +[dependencies] +dependencies = ["crates", "versions"] +[dependencies.columns] +id = "public" +version_id = "public" +crate_id = "public" +req = "public" +optional = "public" +default_features = "public" +features = "public" +target = "public" +kind = "public" + +[__diesel_schema_migrations.columns] +version = "private" +run_on = "private" + +[emails.columns] +id = "private" +user_id = "private" +email = "private" +verified = "private" +token = "private" +token_generated_at = "private" + +[follows.columns] +user_id = "private" +crate_id = "private" + +[keywords.columns] +id = "public" +keyword = "public" +crates_cnt = "public" +created_at = "public" + +[metadata.columns] +total_downloads = "public" + +[publish_limit_buckets.columns] +user_id = "private" +tokens = "private" +last_refill = "private" + +[publish_rate_overrides.columns] +user_id = "private" +burst = "private" + +[readme_renderings.columns] +version_id = "private" +rendered_at = "private" + +[reserved_crate_names.columns] +name = "public" + +[teams.columns] +id = "public" +login = "public" +github_id = "public" +name = "public" +avatar = "public" + +[users] +filter = """ +id in ( + SELECT owner_id AS user_id FROM crate_owners WHERE NOT deleted AND owner_kind = 0 + UNION + SELECT published_by as user_id FROM versions +)""" +[users.columns] +id = "public" +email = "private" +gh_access_token = "private" +gh_login = "public" +name = "public" +gh_avatar = "public" +gh_id = "public" +[users.column_defaults] +gh_access_token = "''" + +[version_authors] +dependencies = ["versions"] +[version_authors.columns] +id = "public" +version_id = "public" +user_id = "private" +name = "public" + +[version_downloads] +dependencies = ["versions"] +[version_downloads.columns] +version_id = "public" +downloads = "public" +counted = "private" +date = "public" +processed = "private" + +[versions] +dependencies = ["crates", "users"] +[versions.columns] +id = "public" +crate_id = "public" +num = 
"public" +updated_at = "public" +created_at = "public" +downloads = "public" +features = "public" +yanked = "public" +license = "public" +crate_size = "public" +published_by = "public" + +[versions_published_by.columns] +version_id = "private" +email = "private" diff --git a/src/tasks/dump_db/dump-export.sql.hbs b/src/tasks/dump_db/dump-export.sql.hbs new file mode 100644 index 00000000000..0fcf38cba4d --- /dev/null +++ b/src/tasks/dump_db/dump-export.sql.hbs @@ -0,0 +1,21 @@ +BEGIN; +{{~#each tables}} +{{~#if this.filter}} + CREATE TEMPORARY VIEW "dump_db_{{this.name}}" AS ( + SELECT {{this.columns}} + FROM "{{this.name}}" + WHERE {{this.filter}} + ); +{{~/if}} +{{~/each}} +COMMIT; + +BEGIN ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +{{~#each tables}} +{{~#if this.filter}} + \copy (SELECT * FROM "dump_db_{{this.name}}") TO 'data/{{this.name}}.csv' WITH CSV HEADER +{{~else}} + \copy "{{this.name}}" ({{this.columns}}) TO 'data/{{this.name}}.csv' WITH CSV HEADER +{{~/if}} +{{~/each}} +COMMIT; diff --git a/src/tasks/dump_db/dump-import.sql.hbs b/src/tasks/dump_db/dump-import.sql.hbs new file mode 100644 index 00000000000..49508e7e35a --- /dev/null +++ b/src/tasks/dump_db/dump-import.sql.hbs @@ -0,0 +1,25 @@ +BEGIN; + -- Set defaults for non-nullable columns not included in the dump. +{{~#each tables as |table|}} +{{~#each column_defaults}} + ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" SET DEFAULT {{this}}; +{{~/each}} +{{~/each}} + + -- Truncate all tables. +{{~#each tables}} + TRUNCATE "{{this.name}}" RESTART IDENTITY CASCADE; +{{~/each}} + + -- Import the CSV data. +{{~#each tables}} + \copy "{{this.name}}" ({{this.columns}}) FROM 'data/{{this.name}}.csv' WITH CSV HEADER +{{~/each}} + + -- Drop the defaults again. +{{~#each tables as |table|}} +{{~#each column_defaults}} + ALTER TABLE "{{table.name}}" ALTER COLUMN "{{@key}}" DROP DEFAULT; +{{~/each}} +{{~/each}} +COMMIT; diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs new file mode 100644 index 00000000000..e128a0165cd --- /dev/null +++ b/src/tasks/dump_db/gen_scripts.rs @@ -0,0 +1,265 @@ +use std::{ + collections::{BTreeMap, VecDeque}, + fs::File, + path::Path, +}; + +use swirl::PerformError; + +pub fn gen_scripts(export_script: &Path, import_script: &Path) -> Result<(), PerformError> { + let config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap(); + let export_sql = File::create(export_script)?; + let import_sql = File::create(import_script)?; + config.gen_psql_scripts(export_sql, import_sql) +} + +/// An enum indicating whether a column is included in the database dumps. +/// Public columns are included, private are not. +#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] +#[serde(rename_all = "lowercase")] +enum ColumnVisibility { + Private, + Public, +} + +/// Filtering information for a single table. The `dependencies` field is only +/// used to determine the order of the tables in the generated import script, +/// and should list all tables the current tables refers to with foreign key +/// constraints on public columns. The `filter` field is a valid SQL expression +/// used in a `WHERE` clause to filter the rows of the table. The `columns` +/// field maps column names to their respective visibilities. 
+#[derive(Clone, Debug, Default, Deserialize)] +struct TableConfig { + #[serde(default)] + dependencies: Vec<String>, + filter: Option<String>, + columns: BTreeMap<String, ColumnVisibility>, + #[serde(default)] + column_defaults: BTreeMap<String, String>, +} + +/// Subset of the configuration data to be passed on to the Handlebars template. +#[derive(Debug, Serialize)] +struct HandlebarsTableContext<'a> { + name: &'a str, + filter: Option<&'a str>, + columns: String, + column_defaults: BTreeMap<&'a str, &'a str>, +} + +impl TableConfig { + fn handlebars_context<'a>(&'a self, name: &'a str) -> Option<HandlebarsTableContext<'a>> { + let columns = self .columns .iter() .filter(|&(_, &vis)| vis == ColumnVisibility::Public) .map(|(col, _)| format!("\"{}\"", col)) .collect::<Vec<_>>() .join(", "); + if columns.is_empty() { + None + } else { + let filter = self.filter.as_ref().map(String::as_str); + let column_defaults = self .column_defaults .iter() .map(|(k, v)| (k.as_str(), v.as_str())) .collect(); + Some(HandlebarsTableContext { + name, + filter, + columns, + column_defaults, + }) + } + } +} + +/// Maps table names to the respective configurations. Used to load `dump-db.toml`. +#[derive(Clone, Debug, Default, Deserialize)] +#[serde(transparent)] +struct VisibilityConfig(BTreeMap<String, TableConfig>); + +/// Subset of the configuration data to be passed on to the Handlebars template. +#[derive(Debug, Serialize)] +struct HandlebarsContext<'a> { + tables: Vec<HandlebarsTableContext<'a>>, +} + +impl VisibilityConfig { + /// Sort the tables in a way that dependencies come before dependent tables. + /// + /// Returns a vector of table names. + fn topological_sort(&self) -> Vec<&str> { + let mut result = Vec::new(); + let mut num_deps = BTreeMap::new(); + let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new(); + for (table, config) in self.0.iter() { + num_deps.insert(table.as_str(), config.dependencies.len()); + for dep in &config.dependencies { + rev_deps .entry(dep.as_str()) .or_default() .push(table.as_str()); + } + } + let mut ready: VecDeque<&str> = num_deps .iter() .filter(|(_, &count)| count == 0) .map(|(&table, _)| table) .collect(); + while let Some(table) = ready.pop_front() { + result.push(table); + for dep in rev_deps.get(table).iter().cloned().flatten() { + *num_deps.get_mut(dep).unwrap() -= 1; + if num_deps[dep] == 0 { + ready.push_back(dep); + } + } + } + assert_eq!( + self.0.len(), + result.len(), + "circular dependencies in database dump configuration detected", + ); + result + } + + fn handlebars_context(&self) -> HandlebarsContext<'_> { + let tables = self .topological_sort() .into_iter() .filter_map(|table| self.0[table].handlebars_context(table)) .collect(); + HandlebarsContext { tables } + } + + fn gen_psql_scripts<W>(&self, export_sql: W, import_sql: W) -> Result<(), PerformError> + where + W: std::io::Write, + { + let context = self.handlebars_context(); + let mut handlebars = handlebars::Handlebars::new(); + handlebars.register_escape_fn(handlebars::no_escape); + handlebars.render_template_to_write( + include_str!("dump-export.sql.hbs"), + &context, + export_sql, + )?; + handlebars.render_template_to_write( + include_str!("dump-import.sql.hbs"), + &context, + import_sql, + )?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::pg_connection; + use diesel::prelude::*; + use std::collections::HashSet; + use std::iter::FromIterator; + + /// Test whether the visibility configuration matches the schema of the + /// test database.
+ #[test] + fn check_visibility_config() { + let conn = pg_connection(); + let db_columns = HashSet::<Column>::from_iter(get_db_columns(&conn)); + let vis_columns = toml::from_str::<VisibilityConfig>(include_str!("dump-db.toml")) + .unwrap() + .0 + .iter() + .flat_map(|(table, config)| { + config.columns.iter().map(move |(column, _)| Column { + table_name: table.clone(), + column_name: column.clone(), + }) + }) + .collect(); + let mut errors = vec![]; + for Column { + table_name, + column_name, + } in db_columns.difference(&vis_columns) + { + errors.push(format!( + "No visibility information for column {}.{}.", + table_name, column_name + )); + } + for Column { + table_name, + column_name, + } in vis_columns.difference(&db_columns) + { + errors.push(format!( + "Column {}.{} does not exist in the database.", + table_name, column_name + )); + } + assert!( + errors.is_empty(), + "The visibility configuration does not match the database schema:\n{}", + errors.join("\n"), + ); + } + + mod information_schema { + table! { + information_schema.columns (table_schema, table_name, column_name) { + table_schema -> Text, + table_name -> Text, + column_name -> Text, + ordinal_position -> Integer, + } + } + } + + #[derive(Debug, Eq, Hash, PartialEq, Queryable)] + struct Column { + table_name: String, + column_name: String, + } + + fn get_db_columns(conn: &PgConnection) -> Vec<Column> { + use information_schema::columns::dsl::*; + columns + .select((table_name, column_name)) + .filter(table_schema.eq("public")) + .order_by((table_name, ordinal_position)) + .load(conn) + .unwrap() + } + + fn table_config_with_deps(deps: &[&str]) -> TableConfig { + TableConfig { + dependencies: deps.iter().cloned().map(ToOwned::to_owned).collect(), + ..Default::default() + } + } + + #[test] + fn test_topological_sort() { + let mut config = VisibilityConfig::default(); + let tables = &mut config.0; + tables.insert("a".to_owned(), table_config_with_deps(&["b", "c"])); + tables.insert("b".to_owned(), table_config_with_deps(&["c", "d"])); + tables.insert("c".to_owned(), table_config_with_deps(&["d"])); + config.0.insert("d".to_owned(), table_config_with_deps(&[])); + assert_eq!(config.topological_sort(), ["d", "c", "b", "a"]); + } + + #[test] + #[should_panic] + fn topological_sort_panics_for_cyclic_dependency() { + let mut config = VisibilityConfig::default(); + let tables = &mut config.0; + tables.insert("a".to_owned(), table_config_with_deps(&["b"])); + tables.insert("b".to_owned(), table_config_with_deps(&["a"])); + config.topological_sort(); + } +} diff --git a/src/tasks/dump_db/readme_for_tarball.md b/src/tasks/dump_db/readme_for_tarball.md new file mode 100644 index 00000000000..3d9f431d311 --- /dev/null +++ b/src/tasks/dump_db/readme_for_tarball.md @@ -0,0 +1,30 @@ +# crates.io Database Dump + +This is a dump of the public information in the crates.io database. + +## Files + +* `data/` – the CSV files with the actual data. +* `export.sql` – the `psql` script that was used to create this database dump. It is only included in the archive for reference. +* `import.sql` – a `psql` script that can be used to restore the dump into a PostgreSQL database with the same schema as the `crates.io` database, destroying all current data. +* `metadata.json` – some metadata of this dump. +* `schema.sql` – a dump of the database schema to facilitate generating a new database from the data. + +## Metadata Fields + +* `timestamp` – the UTC time the dump was started. +* `crates_io_commit` – the git commit hash of the deployed version of crates.io that created this dump.
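A short sketch (not part of the patch) of consuming `metadata.json` from Rust. The two field names are the ones documented above; it assumes `serde_json` plus `chrono` with its `serde` feature enabled:

```rust
use serde::Deserialize;

// Mirrors the fields written by DumpDirectory::add_metadata() earlier in this diff.
#[derive(Debug, Deserialize)]
struct Metadata {
    timestamp: chrono::DateTime<chrono::Utc>,
    crates_io_commit: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = std::fs::File::open("metadata.json")?;
    let meta: Metadata = serde_json::from_reader(file)?;
    println!("dump started at {} from commit {}", meta.timestamp, meta.crates_io_commit);
    Ok(())
}
```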
+ +## Restoring to a Local crates.io Database + +1. Create a new database. + + createdb DATABASE_NAME + +2. Restore the database schema. + + psql DATABASE_NAME < schema.sql + +3. Run the import script. + + psql DATABASE_URL < import.sql diff --git a/src/tests/all.rs b/src/tests/all.rs index 310cdf5db9c..af9108f48bd 100644 --- a/src/tests/all.rs +++ b/src/tests/all.rs @@ -52,6 +52,7 @@ mod badge; mod builders; mod categories; mod category; +mod dump_db; mod git; mod keyword; mod krate; diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs new file mode 100644 index 00000000000..11187d6b16f --- /dev/null +++ b/src/tests/dump_db.rs @@ -0,0 +1,71 @@ +use cargo_registry::tasks::dump_db; +use diesel::{ + connection::{Connection, SimpleConnection}, + pg::PgConnection, +}; + +#[test] +fn dump_db_and_reimport_dump() { + let database_url = crate::env("TEST_DATABASE_URL"); + + // TODO prefill database with some data + + let directory = dump_db::DumpDirectory::create().unwrap(); + directory.populate(&database_url).unwrap(); + + let schema = TemporarySchema::create(database_url, "test_db_dump"); + schema.run_migrations(); + + let import_script = directory.export_dir.join("import.sql"); + dump_db::run_psql(&import_script, &schema.database_url).unwrap(); + + // TODO: Consistency checks on the re-imported data? +} + +struct TemporarySchema { + pub database_url: String, + pub schema_name: String, + pub connection: PgConnection, +} + +impl TemporarySchema { + pub fn create(database_url: String, schema_name: &str) -> Self { + let params = &[("options", format!("--search_path={},public", schema_name))]; + let database_url = url::Url::parse_with_params(&database_url, params) + .unwrap() + .into_string(); + let schema_name = schema_name.to_owned(); + let connection = PgConnection::establish(&database_url).unwrap(); + connection + .batch_execute(&format!( + r#"DROP SCHEMA IF EXISTS "{schema_name}" CASCADE; + CREATE SCHEMA "{schema_name}";"#, + schema_name = schema_name, + )) + .unwrap(); + Self { + database_url, + schema_name, + connection, + } + } + + pub fn run_migrations(&self) { + use diesel_migrations::{find_migrations_directory, run_pending_migrations_in_directory}; + let migrations_dir = find_migrations_directory().unwrap(); + run_pending_migrations_in_directory( + &self.connection, + &migrations_dir, + &mut std::io::sink(), + ) + .unwrap(); + } +} + +impl Drop for TemporarySchema { + fn drop(&mut self) { + self.connection + .batch_execute(&format!(r#"DROP SCHEMA "{}" CASCADE;"#, self.schema_name)) + .unwrap(); + } +}
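One detail of `TemporarySchema` above worth spelling out: it isolates the re-import by appending the libpq `options` connection parameter to the database URL, so the scratch schema shadows `public` in the `search_path` for that connection only. A standalone sketch of just that step, using a hypothetical database name:

```rust
fn main() {
    // "options" forwards command-line options to the Postgres backend;
    // --search_path=... is equivalent to -c search_path=... .
    let params = [("options", "--search_path=test_db_dump,public")];
    let url = url::Url::parse_with_params("postgres://localhost/cargo_registry_test", &params).unwrap();
    println!("{}", url); // the parameter value is percent-encoded in the resulting URL
}
```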