Refactor logging.
authorPeter Palfrader <peter@palfrader.org>
Fri, 29 Sep 2017 09:30:44 +0000 (11:30 +0200)
committerPeter Palfrader <peter@palfrader.org>
Fri, 29 Sep 2017 09:30:44 +0000 (11:30 +0200)
Keep a <component>.lock on the master for all updates, instead of trying
to lock individual directories.

There was a race in static-master-update-component, where we would keep
locks of <component> and <component>-updating.incoming-XXXXXX, and then
move <component> aside and replace it by
<component>-updating.incoming-XXXXXX in two steps.  Things could fail
if in between these two moves, another static-master-update-component
job showed up, and created a new <component> dir.

modules/roles/files/static-mirroring/OVERVIEW [new file with mode: 0644]
modules/roles/files/static-mirroring/static-master-run
modules/roles/files/static-mirroring/static-master-update-component
modules/roles/files/static-mirroring/staticsync-ssh-wrap

diff --git a/modules/roles/files/static-mirroring/OVERVIEW b/modules/roles/files/static-mirroring/OVERVIEW
new file mode 100644 (file)
index 0000000..1836594
--- /dev/null
@@ -0,0 +1,63 @@
+
+- static-update-component is run by the user on the source host.
+  If not run under sudo as the staticuser already, it sudos to the staticuser,
+  re-execing itself.  It them sshs to the static-master for that component to
+  run static-master-update-component.
+
+  LOCKING:
+    none, but see static-master-update-component
+
+- static-master-update-component is run on the static-master.
+  It rsyncs the contents from the source host to the static master, and then
+  triggers static-master-run to push the content to the mirrors.
+
+  The sync happens to a new <component>-updating.incoming-XXXXXX directory.  On
+  sync success, <component> is replaced with that new tree, and the
+  static-master-run trigger happens.
+
+  LOCKING:
+    - exclusive locks are held on
+      - <component>.lock
+
+- static-master-run triggers all the mirrors for a component to initiate syncs.
+  When all mirrors have an up-to-date tree, they are instructed to update
+  the cur-> symlink to the new tree.
+
+  To begin with, static-master-run copies <component> to <component>-current-push.
+  This is the tree all the mirrors then sync from.  If the push was successful,
+  <component>-current-push is renamed to <component>-current-live.
+
+  LOCKING:
+    - exclusive locks are held on
+      - <component>.lock
+
+- static-mirror-run runs on a mirror and syncs components.
+  There is a symlink called 'cur' that points to either tree-a or tree-b for
+  each component.  the cur tree is the one that is live, the other one usually
+  does not exist, except when a sync is ongoing (or a previous one failed and
+  we keep a partial tree).
+
+  During a sync, we sync to the tree-<X> that is not the live one.  When instructed by
+  static-master-run, we update the symlink and remove the old tree.
+
+  static-mirror-run rsyncs either -current-push or -current-live for a component.
+
+  LOCKING:
+    during all of static-mirror-run, we keep an exclusive lock on the <component>
+    dir, i.e., the directory that holds tree-[ab] and cur.
+
+- static-mirror-run-all
+
+  Run static-mirror-run for all components on this mirror, fetching the -live- tree.
+
+  LOCKING:
+    none, but see static-mirror-run.
+
+- staticsync-ssh-wrap
+
+  wrapper for ssh job dispatching on source, master, and mirror.
+
+  LOCKING:
+    - on master, when syncing -live- trees:
+        a shared lock is held on <component>.lock during
+        the rsync process.
index aa61682..7571bad 100755 (executable)
@@ -174,14 +174,22 @@ def run_mirror(component):
   os.chmod(tmpdir_new, 0755)
 
   locks = []
-  for p in (componentdir, live, tmpdir_new):
-    if not os.path.exists(p): os.mkdir(p, 0755)
+  lockfiles = [ os.path.join(basemaster, component + ".lock") ]
+  for p in lockfiles:
     fd = os.open(p, os.O_RDONLY)
     log("Acquiring lock for %s(%d)."%(p,fd))
     fcntl.flock(fd, fcntl.LOCK_EX)
     locks.append(fd)
   log("All locks acquired.")
 
+  #for p in (componentdir, live, tmpdir_new):
+  #  if not os.path.exists(p): os.mkdir(p, 0755)
+  #  fd = os.open(p, os.O_RDONLY)
+  #  log("Acquiring lock for %s(%d)."%(p,fd))
+  #  fcntl.flock(fd, fcntl.LOCK_EX)
+  #  locks.append(fd)
+  #log("All locks acquired.")
+
   serialfile = os.path.join(componentdir, serialname)
   try:
     with open(serialfile) as f: serial = int(f.read())
index 2d397df..c115ae4 100755 (executable)
@@ -91,6 +91,14 @@ if [ -z "$srchost" ] || [ -z "$srcdir" ]; then
   echo >&2 "$0: Invalid component: $component (not found in $componentlist)";
   exit 1
 fi
+
+tgtlock="$masterbase/$component.lock"
+if ! [ -e "$tgtlock" ]; then
+  touch "$tgtlock"
+fi
+echo "$0: Acquiring lock on $tgtlock..."
+lock 203 "$tgtlock" 1
+
 tgt="$masterbase/$component"
 if ! [ -d "$tgt" ]; then
   echo "$0: Creating $tgt for $component";
@@ -103,15 +111,16 @@ else
   src="$srchost:$srcdir"
 fi
 
-echo "$0: Acquiring locks..."
-lock 201 "$tgt" 1
+#echo "$0: Acquiring lock on $tgt..."
+#lock 201 "$tgt" 1
 
 tmpdir_new="$(mktemp -d --tmpdir="$masterbase" "${component}-updating.incoming-XXXXXX")"
 tmpdir_old="$(mktemp -d --tmpdir="$masterbase" "${component}-updating.removing-XXXXXX")"
 trap "rm -rf '$tmpdir_new' '$tmpdir_old'" EXIT
 chmod 0755 "$tmpdir_new"
 
-lock 202 "$tmpdir_new" 1
+#echo "$0: Acquiring lock on $tmpdir_new..."
+#lock 202 "$tmpdir_new" 1
 echo "$0: Got them."
 
 echo "$0: Updating master copy of $component..."
@@ -136,8 +145,9 @@ rm -rf "$tmpdir_new" "$tmpdir_old"
 trap - EXIT
 
 date '+%s' > "$tgt/.serial"
-unlock 201
-unlock 202
+#unlock 201
+#unlock 202
+unlock 203
 echo "$0: Triggering mirror runs..."
 exec static-master-run "$component"
 
index a407547..959b4f8 100755 (executable)
@@ -97,9 +97,13 @@ do_rsync_on_master() {
                        elif [ "$*" = "$args $component/-live-/" ] || [ "$*" = "$args ./$component/-live-/" ] ; then
                                local path="$BASEDIR/master/$component-current-live"
                                info "host $remote_host wants $path, acquiring lock"
-                               exec 200< "$path"
+                               tgtlock="$BASEDIR/master/$component.lock"
+                               if ! [ -e "$tgtlock" ]; then
+                                       touch "$tgtlock"
+                               fi
+                               exec 200< "$tgtlock"
                                if ! flock -s -w 0 200; then
-                               echo >&2 "Cannot acquire shared lock on $path - this should mean an update is already underway anyway."
+                               echo >&2 "Cannot acquire shared lock on $tgtlock covering $path - this should mean an update is already underway anyway."
                                exit 1
                                fi
                                exec rsync $args "$path/."