c667ef
From b4d103c6dec2aa0f8461e1ca78ad23d692e68d36 Mon Sep 17 00:00:00 2001
c667ef
From: Matthew Almond <malmond@fb.com>
c667ef
Date: Thu, 20 May 2021 13:35:13 -0700
c667ef
Subject: [PATCH 1/3] Add option --skip-filelists
c667ef
c667ef
This is a site-local optimization. Some packages and repos include an
c667ef
enormous number of files. This is extremely expensive if said repo is
c667ef
also fast changing.
c667ef
c667ef
Impact of skipping filelists: breaking ability to resolve file/path
c667ef
based dependencies, `-f` (file ownership) and `-l` (list) options in
c667ef
repoquery.
c667ef
---
c667ef
 doc/createrepo_c.8  | 3 +++
c667ef
 src/cmd_parser.c    | 2 ++
c667ef
 src/cmd_parser.h    | 1 +
c667ef
 src/createrepo_c.c  | 1 +
c667ef
 src/dumper_thread.c | 5 +++++
c667ef
 src/dumper_thread.h | 1 +
c667ef
 src/parsehdr.c      | 3 ++-
c667ef
 src/parsehdr.h      | 1 +
c667ef
 8 files changed, 16 insertions(+), 1 deletion(-)
c667ef
c667ef
diff --git a/doc/createrepo_c.8 b/doc/createrepo_c.8
c667ef
index e9b3fc2d..10702f48 100644
c667ef
--- a/doc/createrepo_c.8
c667ef
+++ b/doc/createrepo_c.8
c667ef
@@ -213,5 +213,8 @@ Exit with retval 2 if there were any errors during processing
c667ef
 .SS \-\-ignore\-lock
c667ef
 .sp
c667ef
 Expert (risky) option: Ignore an existing .repodata/. (Remove the existing .repodata/ and create an empty new one to serve as a lock for other createrepo instances. For the repodata generation, a different temporary dir with the name in format .repodata.time.microseconds.pid/ will be used). NOTE: Use this option on your own risk! If two createrepos run simultaneously, then the state of the generated metadata is not guaranteed \- it can be inconsistent and wrong.
c667ef
+.SS \-\-skip\-filelists
c667ef
+.sp
c667ef
+Expert (risky) option: Skip filelist generation.
c667ef
 .\" Generated by docutils manpage writer.
c667ef
 .
c667ef
diff --git a/src/cmd_parser.c b/src/cmd_parser.c
c667ef
index bbefa080..0ecf7f99 100644
c667ef
--- a/src/cmd_parser.c
c667ef
+++ b/src/cmd_parser.c
c667ef
@@ -224,6 +224,8 @@ static GOptionEntry expert_entries[] =
c667ef
       "own risk! If two createrepos run simultaneously, then the state of the "
c667ef
       "generated metadata is not guaranteed - it can be inconsistent and wrong.",
c667ef
       NULL },
c667ef
+    { "skip-filelists", 0, 0, G_OPTION_ARG_NONE, &(_cmd_options.skip_filelists),
c667ef
+      "Expert (risky) option: Skip filelist generation.", NULL},
c667ef
     { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL },
c667ef
 };
c667ef
 
c667ef
diff --git a/src/cmd_parser.h b/src/cmd_parser.h
c667ef
index 32bcf992..5eff9d9d 100644
c667ef
--- a/src/cmd_parser.h
c667ef
+++ b/src/cmd_parser.h
c667ef
@@ -57,6 +57,7 @@ struct CmdOptions {
c667ef
     char *general_compress_type;/*!< which compression type to use (even for
c667ef
                                      primary, filelists and other xml) */
c667ef
     gboolean skip_symlinks;     /*!< ignore symlinks of packages */
c667ef
+    gboolean skip_filelists;    /*!< Skip creating filelists */
c667ef
     gint changelog_limit;       /*!< number of changelog messages in
c667ef
                                      other.(xml|sqlite) */
c667ef
     gboolean unique_md_filenames;       /*!< include the file checksums in
c667ef
diff --git a/src/createrepo_c.c b/src/createrepo_c.c
c667ef
index f4f45445..9dd288e5 100644
c667ef
--- a/src/createrepo_c.c
c667ef
+++ b/src/createrepo_c.c
c667ef
@@ -1253,6 +1253,7 @@ main(int argc, char **argv)
c667ef
     user_data.checksum_type     = cmd_options->checksum_type;
c667ef
     user_data.checksum_cachedir = cmd_options->checksum_cachedir;
c667ef
     user_data.skip_symlinks     = cmd_options->skip_symlinks;
c667ef
+    user_data.skip_filelists    = cmd_options->skip_filelists;
c667ef
     user_data.repodir_name_len  = strlen(in_dir);
c667ef
     user_data.task_count        = task_count;
c667ef
     user_data.package_count     = 0;
c667ef
diff --git a/src/dumper_thread.c b/src/dumper_thread.c
c667ef
index 119f3bd8..f7c4e356 100644
c667ef
--- a/src/dumper_thread.c
c667ef
+++ b/src/dumper_thread.c
c667ef
@@ -431,6 +431,11 @@ cr_dumper_thread(gpointer data, gpointer user_data)
c667ef
     if (udata->checksum_cachedir)
c667ef
         hdrrflags = CR_HDRR_LOADHDRID | CR_HDRR_LOADSIGNATURES;
c667ef
 
c667ef
+
c667ef
+    // Load filelists, unless --skip-filelists is passed.
c667ef
+    if (udata->skip_filelists)
c667ef
+        hdrrflags |= CR_HDRR_SKIPFILES;
c667ef
+
c667ef
     // Get stat info about file
c667ef
     if (udata->old_metadata && !(udata->skip_stat)) {
c667ef
         if (stat(task->full_path, &stat_buf) == -1) {
c667ef
diff --git a/src/dumper_thread.h b/src/dumper_thread.h
c667ef
index 60f984d7..654991fc 100644
c667ef
--- a/src/dumper_thread.h
c667ef
+++ b/src/dumper_thread.h
c667ef
@@ -66,6 +66,7 @@ struct UserData {
c667ef
     cr_ChecksumType checksum_type;  // Constant representing selected checksum
c667ef
     const char *checksum_cachedir;  // Dir with cached checksums
c667ef
     gboolean skip_symlinks;         // Skip symlinks
c667ef
+    gboolean skip_filelists;        // Skip filelists
c667ef
     long task_count;                // Total number of task to process
c667ef
     long package_count;             // Total number of packages processed
c667ef
 
c667ef
diff --git a/src/parsehdr.c b/src/parsehdr.c
c667ef
index 2775bf31..97bb01e4 100644
c667ef
--- a/src/parsehdr.c
c667ef
+++ b/src/parsehdr.c
c667ef
@@ -253,7 +253,8 @@ cr_package_from_header(Header hdr,
c667ef
         assert(x == dir_count);
c667ef
     }
c667ef
 
c667ef
-    if (headerGet(hdr, RPMTAG_FILENAMES,  full_filenames,  flags) &&
c667ef
+    if (!(hdrrflags & CR_HDRR_SKIPFILES) &&
c667ef
+        headerGet(hdr, RPMTAG_FILENAMES,  full_filenames,  flags) &&
c667ef
         headerGet(hdr, RPMTAG_DIRINDEXES, indexes,  flags) &&
c667ef
         headerGet(hdr, RPMTAG_BASENAMES,  filenames, flags) &&
c667ef
         headerGet(hdr, RPMTAG_FILEFLAGS,  fileflags, flags) &&
c667ef
diff --git a/src/parsehdr.h b/src/parsehdr.h
c667ef
index 032accad..e7a4a4aa 100644
c667ef
--- a/src/parsehdr.h
c667ef
+++ b/src/parsehdr.h
c667ef
@@ -39,6 +39,7 @@ typedef enum {
c667ef
     CR_HDRR_NONE            = (1 << 0),
c667ef
     CR_HDRR_LOADHDRID       = (1 << 1), /*!< Load hdrid */
c667ef
     CR_HDRR_LOADSIGNATURES  = (1 << 2), /*!< Load siggpg and siggpg */
c667ef
+    CR_HDRR_SKIPFILES       = (1 << 3), /*!< Skip filelists */
c667ef
 } cr_HeaderReadingFlags;
c667ef
 
c667ef
 /** Read data from header and return filled cr_Package structure.
c667ef
c667ef
From da623813071274201d2bf75a7df25def03222f11 Mon Sep 17 00:00:00 2001
c667ef
From: Matthew Almond <malmond@fb.com>
c667ef
Date: Fri, 4 Jun 2021 11:48:23 -0700
c667ef
Subject: [PATCH 2/3] Expand documentation, add warning
c667ef
c667ef
Help text:
c667ef
c667ef
```
c667ef
$ createrepo_c --help-expert
c667ef
Usage:
c667ef
  createrepo_c [OPTION?] <directory_to_index>
c667ef
c667ef
Program that creates a repomd (xml-based rpm metadata) repository from a set of rpms.
c667ef
c667ef
Expert (risky) options
c667ef
  --ignore-lock                                Expert (risky) option:
c667ef
Ignore an existing .repodata/. (Remove the existing .repodata/ and create an empty new one to serve as a lock for other createrepo instances. For the repodata generation, a different temporary dir with the name in format ".repodata.time.microseconds.pid/" will be used).  NOTE: Use this option on your own risk! If two createrepos run simultaneously, then the state of the generated metadata is not guaranteed - it can be inconsistent and wrong.
c667ef
  --skip-filelists                             Expert (risky) option:
c667ef
Skip filelist generation, potentially saving significant bandwidth for repos with large numbers of files in packages. NOTE: Use this option on your own risk! This is a site-local optimization and should not be used for public repos. The site operator warrants the filenames in the packages in the repo are not named as required in other packages. The site operator also warrants that all clients do not need to use repoquery -l and -f to list or find packages that own a given file.
c667ef
```
c667ef
c667ef
Example run:
c667ef
c667ef
```
c667ef
$ createrepo_c --skip-filelists .
c667ef
Directory walk started
c667ef
Directory walk done - 9 packages
c667ef
Temporary output repo path: ./.repodata/
c667ef
Preparing sqlite DBs
c667ef
Warning: Expert option: --skip-filelists for site-local optimization active
c667ef
Pool started (with 5 workers)
c667ef
Pool finished
c667ef
```
c667ef
---
c667ef
 doc/createrepo_c.8 | 2 +-
c667ef
 src/cmd_parser.c   | 9 ++++++++-
c667ef
 src/createrepo_c.c | 4 ++++
c667ef
 3 files changed, 13 insertions(+), 2 deletions(-)
c667ef
c667ef
diff --git a/doc/createrepo_c.8 b/doc/createrepo_c.8
c667ef
index 10702f48..0788929d 100644
c667ef
--- a/doc/createrepo_c.8
c667ef
+++ b/doc/createrepo_c.8
c667ef
@@ -215,6 +215,6 @@ Exit with retval 2 if there were any errors during processing
c667ef
 Expert (risky) option: Ignore an existing .repodata/. (Remove the existing .repodata/ and create an empty new one to serve as a lock for other createrepo instances. For the repodata generation, a different temporary dir with the name in format .repodata.time.microseconds.pid/ will be used). NOTE: Use this option on your own risk! If two createrepos run simultaneously, then the state of the generated metadata is not guaranteed \- it can be inconsistent and wrong.
c667ef
 .SS \-\-skip\-filelists
c667ef
 .sp
c667ef
-Expert (risky) option: Skip filelist generation.
c667ef
+Expert (risky) option: Skip filelist generation, potentially saving significant bandwidth for repos with large numbers of files in packages. NOTE: Use this option on your own risk! This is a site-local optimization and should not be used for public repos. The site operator warrants the filenames in the packages in the repo are not named as required in other packages. The site operator also warrants that all clients do not need to use repoquery -l and -f to list or find packages that own a given file.
c667ef
 .\" Generated by docutils manpage writer.
c667ef
 .
c667ef
diff --git a/src/cmd_parser.c b/src/cmd_parser.c
c667ef
index 0ecf7f99..92a2ca9d 100644
c667ef
--- a/src/cmd_parser.c
c667ef
+++ b/src/cmd_parser.c
c667ef
@@ -225,7 +225,14 @@ static GOptionEntry expert_entries[] =
c667ef
       "generated metadata is not guaranteed - it can be inconsistent and wrong.",
c667ef
       NULL },
c667ef
     { "skip-filelists", 0, 0, G_OPTION_ARG_NONE, &(_cmd_options.skip_filelists),
c667ef
-      "Expert (risky) option: Skip filelist generation.", NULL},
c667ef
+      "Expert (risky) option: Skip filelist generation, potentially saving "
c667ef
+      "significant bandwidth for repos with large numbers of files in "
c667ef
+      "packages. NOTE: Use this option on your own risk! This is a site-local "
c667ef
+      "optimization and should not be used for public repos. The site "
c667ef
+      "operator warrants the filenames in the packages in the repo are not "
c667ef
+      "named as required in other packages. The site operator also warrants "
c667ef
+      "that all clients do not need to use repoquery -l and -f to list or "
c667ef
+      "find packages that own a given file.", NULL},
c667ef
     { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL },
c667ef
 };
c667ef
 
c667ef
diff --git a/src/createrepo_c.c b/src/createrepo_c.c
c667ef
index 9dd288e5..f942b1d6 100644
c667ef
--- a/src/createrepo_c.c
c667ef
+++ b/src/createrepo_c.c
c667ef
@@ -1282,6 +1282,10 @@ main(int argc, char **argv)
c667ef
     g_mutex_init(&(user_data.mutex_old_md));
c667ef
     g_mutex_init(&(user_data.mutex_deltatargetpackages));
c667ef
 
c667ef
+    if (cmd_options->skip_filelists) {
c667ef
+      g_warning("Expert option: --skip-filelists for site-local optimization active");
c667ef
+    }
c667ef
+
c667ef
     g_debug("Thread pool user data ready");
c667ef
 
c667ef
     // Start pool
c667ef
c667ef
From 1688eb2a379aa440769d07eae71d0d44318f1afa Mon Sep 17 00:00:00 2001
c667ef
From: Matthew Almond <malmond@fb.com>
c667ef
Date: Mon, 7 Jun 2021 15:47:42 -0700
c667ef
Subject: [PATCH 3/3] Reworded
c667ef
c667ef
---
c667ef
 doc/createrepo_c.8 |  2 +-
c667ef
 src/cmd_parser.c   | 14 +++++++-------
c667ef
 2 files changed, 8 insertions(+), 8 deletions(-)
c667ef
c667ef
diff --git a/doc/createrepo_c.8 b/doc/createrepo_c.8
c667ef
index 0788929d..8e1e8b35 100644
c667ef
--- a/doc/createrepo_c.8
c667ef
+++ b/doc/createrepo_c.8
c667ef
@@ -215,6 +215,6 @@ Exit with retval 2 if there were any errors during processing
c667ef
 Expert (risky) option: Ignore an existing .repodata/. (Remove the existing .repodata/ and create an empty new one to serve as a lock for other createrepo instances. For the repodata generation, a different temporary dir with the name in format .repodata.time.microseconds.pid/ will be used). NOTE: Use this option on your own risk! If two createrepos run simultaneously, then the state of the generated metadata is not guaranteed \- it can be inconsistent and wrong.
c667ef
 .SS \-\-skip\-filelists
c667ef
 .sp
c667ef
-Expert (risky) option: Skip filelist generation, potentially saving significant bandwidth for repos with large numbers of files in packages. NOTE: Use this option on your own risk! This is a site-local optimization and should not be used for public repos. The site operator warrants the filenames in the packages in the repo are not named as required in other packages. The site operator also warrants that all clients do not need to use repoquery -l and -f to list or find packages that own a given file.
c667ef
+Expert (dangerous) option: Skip filelist generation, potentially saving significant bandwidth for repos with large numbers of files in packages. NOTE: Use this option on your own risk! This is a site-local optimization and should not be used for public repos. This option is known to break dependency resolution for any packages which depend on files provided by packages within this repository, e.g. /bin/bash). It will also prevent using repoquery -l and -f from listing files owned by a package or discover which package owns a particular file.
c667ef
 .\" Generated by docutils manpage writer.
c667ef
 .
c667ef
diff --git a/src/cmd_parser.c b/src/cmd_parser.c
c667ef
index 92a2ca9d..8c355fb5 100644
c667ef
--- a/src/cmd_parser.c
c667ef
+++ b/src/cmd_parser.c
c667ef
@@ -225,14 +225,14 @@ static GOptionEntry expert_entries[] =
c667ef
       "generated metadata is not guaranteed - it can be inconsistent and wrong.",
c667ef
       NULL },
c667ef
     { "skip-filelists", 0, 0, G_OPTION_ARG_NONE, &(_cmd_options.skip_filelists),
c667ef
-      "Expert (risky) option: Skip filelist generation, potentially saving "
c667ef
-      "significant bandwidth for repos with large numbers of files in "
c667ef
+      "Expert (dangerous) option: Skip filelist generation, potentially "
c667ef
+      "saving significant bandwidth for repos with large numbers of files in "
c667ef
       "packages. NOTE: Use this option on your own risk! This is a site-local "
c667ef
-      "optimization and should not be used for public repos. The site "
c667ef
-      "operator warrants the filenames in the packages in the repo are not "
c667ef
-      "named as required in other packages. The site operator also warrants "
c667ef
-      "that all clients do not need to use repoquery -l and -f to list or "
c667ef
-      "find packages that own a given file.", NULL},
c667ef
+      "optimization and should not be used for public repos. This option is "
c667ef
+      "known to break dependency resolution for any packages which depend on "
c667ef
+      "files provided by packages within this repository, e.g. /bin/bash). It "
c667ef
+      "will also prevent using repoquery -l and -f from listing files owned "
c667ef
+      "by a package or discover which package owns a particular file.", NULL},
c667ef
     { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL },
c667ef
 };
c667ef