Introduce new options for estimating table sizes by hanefi · Pull Request #793 · dimitri/pgcopydb · GitHub

Introduce new options for estimating table sizes #793


Merged · 3 commits · May 29, 2024
5 changes: 2 additions & 3 deletions README.md
@@ -289,9 +289,8 @@ implementing any step on its own.
6. `pgcopydb copy sequences`
7. `pgcopydb copy indexes`
8. `pgcopydb copy constraints`
9. `pgcopydb vacuumdb`
10. `pgcopydb restore post-data`
11. `kill %1`
9. `pgcopydb restore post-data`
10. `kill %1`

Using individual commands fails to provide the advanced concurrency
capabilities of the main `pgcopydb clone` command, so it is strongly
1 change: 1 addition & 0 deletions docs/include/clone.rst
@@ -11,6 +11,7 @@
--restore-jobs Number of concurrent jobs for pg_restore
--large-objects-jobs Number of concurrent Large Objects jobs to run
--split-tables-larger-than Same-table concurrency size threshold
--estimate-table-sizes Allow using estimates for relation sizes
--drop-if-exists On the target database, clean-up from a previous run first
--roles Also copy roles found on source to target
--no-role-passwords Do not dump passwords for roles
1 change: 1 addition & 0 deletions docs/include/list-table-parts.rst
@@ -9,4 +9,5 @@
--table-name Name of the target table
--split-tables-larger-than Size threshold to consider partitioning
--skip-split-by-ctid Skip the ctid split
--estimate-table-sizes Allow using estimates for relation sizes

2 changes: 0 additions & 2 deletions docs/include/list-tables.rst
@@ -6,8 +6,6 @@
--source Postgres URI to the source database
--filter <filename> Use the filters defined in <filename>
--force Force fetching catalogs again
--cache Cache table size in relation pgcopydb.pgcopydb_table_size
--drop-cache Drop relation pgcopydb.pgcopydb_table_size
--list-skipped List only tables that are setup to be skipped
--without-pkey List only tables that have no primary key

31 changes: 28 additions & 3 deletions docs/ref/pgcopydb_clone.rst
@@ -396,6 +396,16 @@ The following options are available to ``pgcopydb clone``:
This environment variable value is expected to be a byte size, and bytes
units B, kB, MB, GB, TB, PB, and EB are known.

--estimate-table-sizes

Use estimates on table sizes to decide how to split tables when using
:ref:`same_table_concurrency`.

When this option is used, pgcopydb runs the `vacuumdb --analyze-only
--jobs=<table-jobs>` command on the source database, which updates the
statistics for the number of pages in each relation. The number of pages,
multiplied by the page size, then gives an estimate of the actual size of
each table.

--drop-if-exists

When restoring the schema on the target Postgres instance, ``pgcopydb``
@@ -510,9 +520,9 @@ The following options are available to ``pgcopydb clone``:
--skip-split-by-ctid

Skip splitting tables based on CTID during the copy operation. By default,
pgcopydb splits large tables into smaller chunks based on the CTID column
if there isn't a unique integer column in the table. However, in some cases
you may want to skip this splitting process if the CTID range scan is slow
in the underlying system.

--filters <filename>
@@ -729,6 +739,21 @@ PGCOPYDB_SPLIT_TABLES_LARGER_THAN
When ``--split-tables-larger-than`` is omitted from the command line,
then this environment variable is used.

PGCOPYDB_ESTIMATE_TABLE_SIZES

Review comment (Owner): Actual documentation coverage for the option is missing, see the option list at https://pgcopydb.readthedocs.io/en/latest/ref/pgcopydb_clone.html#options. In particular, we need to document the vacuumdb call that is implemented when using that option, as that's not transparent to the user at all.

When true (or *yes*, or *on*, or 1, same input as a Postgres boolean)
then pgcopydb estimates the size of tables to determine whether or not to
split tables. This option is only useful when querying relation sizes on
the source database is costly.

When ``--estimate-table-sizes`` is omitted from the command line, then
this environment variable is used.

When this option is used, pgcopydb runs the `vacuumdb --analyze-only
--jobs=<table-jobs>` command on the source database, which updates the
statistics for the number of pages in each relation. The number of pages,
multiplied by the page size, then gives an estimate of the actual size of
each table.

PGCOPYDB_OUTPUT_PLUGIN

Logical decoding output plugin to use. When ``--plugin`` is omitted from the
10 changes: 10 additions & 0 deletions docs/ref/pgcopydb_copy.rst
@@ -421,6 +421,16 @@ PGCOPYDB_SPLIT_TABLES_LARGER_THAN
When ``--split-tables-larger-than`` is omitted from the command line,
then this environment variable is used.

PGCOPYDB_ESTIMATE_TABLE_SIZES

When true (or *yes*, or *on*, or 1, same input as a Postgres boolean)
then pgcopydb estimates the size of tables to determine whether or not to
split tables. This option is only useful when querying relation sizes on
the source database is costly.

When ``--estimate-table-sizes`` is omitted from the command line, then
this environment variable is used.

PGCOPYDB_DROP_IF_EXISTS

When true (or *yes*, or *on*, or 1, same input as a Postgres boolean)
7 changes: 0 additions & 7 deletions docs/ref/pgcopydb_list.rst
@@ -70,13 +70,6 @@ tables to COPY the data from.

.. include:: ../include/list-tables.rst

The ``--cache`` option allows caching the `pg_table_size()`__ result in the
newly created table ``pgcopydb.pgcopydb_table_size``. This is only useful in
Postgres deployments where this computation is quite slow, and when the
pgcopydb operation is going to be run multiple times.

__ https://www.postgresql.org/docs/15/functions-admin.html#FUNCTIONS-ADMIN-DBSIZE

.. _pgcopydb_list_table_parts:

pgcopydb list table-parts
1 change: 1 addition & 0 deletions src/bin/pgcopydb/cli_clone_follow.c
@@ -34,6 +34,7 @@
" --restore-jobs Number of concurrent jobs for pg_restore\n" \
" --large-objects-jobs Number of concurrent Large Objects jobs to run\n" \
" --split-tables-larger-than Same-table concurrency size threshold\n" \
" --estimate-table-sizes Allow using estimates for relation sizes\n" \
" --drop-if-exists On the target database, clean-up from a previous run first\n" \
" --roles Also copy roles found on source to target\n" \
" --no-role-passwords Do not dump passwords for roles\n" \
36 changes: 35 additions & 1 deletion src/bin/pgcopydb/cli_common.c
@@ -322,6 +322,32 @@ cli_copydb_getenv(CopyDBOptions *options)
++errors;
}

/* when --estimate-table-sizes has not been used, check PGCOPYDB_ESTIMATE_TABLE_SIZES */
if (!options->estimateTableSizes)
{
if (env_exists(PGCOPYDB_ESTIMATE_TABLE_SIZES))
{
char estimateTableSizesAsString[BUFSIZE] = { 0 };

if (!get_env_copy(PGCOPYDB_ESTIMATE_TABLE_SIZES,
estimateTableSizesAsString,
sizeof(estimateTableSizesAsString)))
{
/* errors have already been logged */
++errors;
}
else if (!parse_bool(estimateTableSizesAsString,
&(options->estimateTableSizes)))
{
log_error("Failed to parse environment variable \"%s\" "
"value \"%s\", expected a boolean (on/off)",
PGCOPYDB_ESTIMATE_TABLE_SIZES,
estimateTableSizesAsString);
++errors;
}
}
}

/* when --snapshot has not been used, check PGCOPYDB_SNAPSHOT */
if (env_exists(PGCOPYDB_SNAPSHOT))
{
@@ -825,6 +851,7 @@ cli_copy_db_getopts(int argc, char **argv)
{ "large-objects-jobs", required_argument, NULL, 'b' },
{ "split-tables-larger-than", required_argument, NULL, 'L' },
{ "split-at", required_argument, NULL, 'L' },
{ "estimate-table-sizes", no_argument, NULL, 'm' },
{ "drop-if-exists", no_argument, NULL, 'c' }, /* pg_restore -c */
{ "roles", no_argument, NULL, 'A' }, /* pg_dumpall --roles-only */
{ "no-role-passwords", no_argument, NULL, 'P' },
@@ -884,7 +911,7 @@
}

const char *optstring =
"S:T:D:J:I:b:L:cAPOXj:xBeMlUgkyF:F:Q:irRCN:fp:ws:o:tE:Vvdzqh";
"S:T:D:J:I:b:L:mcAPOXj:xBeMlUgkyF:F:Q:irRCN:fp:ws:o:tE:Vvdzqh";

while ((c = getopt_long(argc, argv,
optstring, long_options, &option_index)) != -1)
@@ -983,6 +1010,13 @@
break;
}

case 'm':
{
options.estimateTableSizes = true;
log_trace("--estimate-table-sizes");
break;
}

case 'c':
{
options.restoreOptions.dropIfExists = true;
3 changes: 2 additions & 1 deletion src/bin/pgcopydb/cli_common.h
@@ -1,5 +1,5 @@
/*
* src/bin/pg_autoctl/cli_common.h
* src/bin/pgcopydb/cli_common.h
* Implementation of a CLI which lets you run individual keeper routines
* directly
*
@@ -39,6 +39,7 @@ typedef struct CopyDBOptions
int lObjectJobs;

SplitTableLargerThan splitTablesLargerThan;
bool estimateTableSizes;

RestoreOptions restoreOptions;

5 changes: 3 additions & 2 deletions src/bin/pgcopydb/cli_compare.c
@@ -104,8 +104,9 @@ cli_compare_getopts(int argc, char **argv)
exit(EXIT_CODE_BAD_ARGS);
}

/* bypass computing partitionning specs */
options.splitTablesLargerThan.bytes = 0;
/* bypass computing partitioning specs */
SplitTableLargerThan empty = { 0 };
options.splitTablesLargerThan = empty;

while ((c = getopt_long(argc, argv, "S:T:D:j:JVvdzqh",
long_options, &option_index)) != -1)
64 changes: 36 additions & 28 deletions src/bin/pgcopydb/cli_list.c
@@ -96,8 +96,6 @@ static CommandLine list_tables_command =
" --source Postgres URI to the source database\n"
" --filter <filename> Use the filters defined in <filename>\n"
" --force Force fetching catalogs again\n"
" --cache Cache table size in relation pgcopydb.pgcopydb_table_size\n"
" --drop-cache Drop relation pgcopydb.pgcopydb_table_size\n"
" --list-skipped List only tables that are setup to be skipped\n"
" --without-pkey List only tables that have no primary key\n",
cli_list_db_getopts,
@@ -113,7 +111,8 @@ static CommandLine list_table_parts_command =
" --schema-name Name of the schema where to find the table\n"
" --table-name Name of the target table\n"
" --split-tables-larger-than Size threshold to consider partitioning\n"
" --skip-split-by-ctid Skip the ctid split\n",
" --skip-split-by-ctid Skip the ctid split\n"
" --estimate-table-sizes Allow using estimates for relation sizes\n",
cli_list_db_getopts,
cli_list_table_parts);

@@ -222,6 +221,32 @@ cli_list_getenv(ListDBOptions *options)
++errors;
}

/* when --estimate-table-sizes has not been used, check PGCOPYDB_ESTIMATE_TABLE_SIZES */
if (!options->estimateTableSizes)
{
if (env_exists(PGCOPYDB_ESTIMATE_TABLE_SIZES))
{
char estimateTableSizesAsString[BUFSIZE] = { 0 };

if (!get_env_copy(PGCOPYDB_ESTIMATE_TABLE_SIZES,
estimateTableSizesAsString,
sizeof(estimateTableSizesAsString)))
{
/* errors have already been logged */
++errors;
}
else if (!parse_bool(estimateTableSizesAsString,
&(options->estimateTableSizes)))
{
log_error("Failed to parse environment variable \"%s\" "
"value \"%s\", expected a boolean (on/off)",
PGCOPYDB_ESTIMATE_TABLE_SIZES,
estimateTableSizesAsString);
++errors;
}
}
}

return errors == 0;
}

@@ -247,10 +272,9 @@ cli_list_db_getopts(int argc, char **argv)
{ "without-pkey", no_argument, NULL, 'P' },
{ "split-tables-larger-than", required_argument, NULL, 'L' },
{ "split-at", required_argument, NULL, 'L' },
{ "estimate-table-sizes", no_argument, NULL, 'm' },
{ "skip-split-by-ctid", no_argument, NULL, 'k' },
{ "force", no_argument, NULL, 'f' },
{ "cache", no_argument, NULL, 'c' },
{ "drop-cache", no_argument, NULL, 'C' },
{ "summary", no_argument, NULL, 'y' },
{ "available-versions", no_argument, NULL, 'a' },
{ "requirements", no_argument, NULL, 'r' },
@@ -277,7 +301,9 @@
exit(EXIT_CODE_BAD_ARGS);
}

while ((c = getopt_long(argc, argv, "S:D:s:t:F:xPLk:fcCyarJRIN:Vdzvqh",
const char *optstring = "S:D:s:t:F:xPLk:mfyarJRIN:Vdzvqh";

while ((c = getopt_long(argc, argv, optstring,
long_options, &option_index)) != -1)
{
switch (c)
@@ -377,29 +403,10 @@
break;
}

case 'c':
case 'm':
{
if (options.dropCache)
{
log_fatal("Please choose either --cache or --drop-cache");
++errors;
}

options.cache = true;
log_trace("--cache");
break;
}

case 'C':
{
if (options.cache)
{
log_fatal("Please choose either --cache or --drop-cache");
++errors;
}

options.dropCache = true;
log_trace("--drop-cache");
options.estimateTableSizes = true;
log_trace("--estimate-table-sizes");
break;
}

@@ -2033,6 +2040,7 @@ copydb_init_specs_from_listdboptions(CopyDataSpec *copySpecs,
options.connStrings = listDBoptions->connStrings;
options.splitTablesLargerThan = listDBoptions->splitTablesLargerThan;
options.skipCtidSplit = listDBoptions->skipCtidSplit;
options.estimateTableSizes = listDBoptions->estimateTableSizes;

/* process the --resume --not-consistent --snapshot options now */
options.resume = listDBoptions->resume;
3 changes: 1 addition & 2 deletions src/bin/pgcopydb/cli_list.h
@@ -29,8 +29,6 @@ typedef struct ListDBOptions
bool listSkipped;
bool noPKey;
bool force;
bool cache;
bool dropCache;
bool summary;
bool availableVersions;
bool requirements;
@@ -41,6 +39,7 @@ typedef struct ListDBOptions
char snapshot[BUFSIZE];

SplitTableLargerThan splitTablesLargerThan;
bool estimateTableSizes;
} ListDBOptions;


1 change: 1 addition & 0 deletions src/bin/pgcopydb/copydb.c
@@ -536,6 +536,7 @@ copydb_init_specs(CopyDataSpec *specs,
.vacuumJobs = options->tableJobs,

.splitTablesLargerThan = options->splitTablesLargerThan,
.estimateTableSizes = options->estimateTableSizes,

.vacuumQueue = { 0 },
.indexQueue = { 0 },
1 change: 1 addition & 0 deletions src/bin/pgcopydb/copydb.h
@@ -244,6 +244,7 @@ typedef struct CopyDataSpec
int lObjectJobs;

SplitTableLargerThan splitTablesLargerThan;
bool estimateTableSizes;

Queue copyQueue;
Queue indexQueue;