From 3d74e4c16387fe32b8fcf92b21201d05da9d7ea5 Mon Sep 17 00:00:00 2001 From: Peter Palfrader Date: Fri, 29 Sep 2017 11:30:44 +0200 Subject: [PATCH] Refactor logging. Keep a .lock on the master for all updates, instead of trying to lock individual directories. There was a race in static-master-update-component, where we would keep locks of and -updating.incoming-XXXXXX, and then move aside and replace it by -updating.incoming-XXXXXX in two steps. Things could fail if in between these two moves, another static-master-update-component job showed up, and created a new dir. --- modules/roles/files/static-mirroring/OVERVIEW | 63 +++++++++++++++++++ .../files/static-mirroring/static-master-run | 12 +++- .../static-master-update-component | 20 ++++-- .../static-mirroring/staticsync-ssh-wrap | 8 ++- 4 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 modules/roles/files/static-mirroring/OVERVIEW diff --git a/modules/roles/files/static-mirroring/OVERVIEW b/modules/roles/files/static-mirroring/OVERVIEW new file mode 100644 index 000000000..18365946e --- /dev/null +++ b/modules/roles/files/static-mirroring/OVERVIEW @@ -0,0 +1,63 @@ + +- static-update-component is run by the user on the source host. + If not run under sudo as the staticuser already, it sudos to the staticuser, + re-execing itself. It them sshs to the static-master for that component to + run static-master-update-component. + + LOCKING: + none, but see static-master-update-component + +- static-master-update-component is run on the static-master. + It rsyncs the contents from the source host to the static master, and then + triggers static-master-run to push the content to the mirrors. + + The sync happens to a new -updating.incoming-XXXXXX directory. On + sync success, is replaced with that new tree, and the + static-master-run trigger happens. + + LOCKING: + - exclusive locks are held on + - .lock + +- static-master-run triggers all the mirrors for a component to initiate syncs. + When all mirrors have an up-to-date tree, they are instructed to update + the cur-> symlink to the new tree. + + To begin with, static-master-run copies to -current-push. + This is the tree all the mirrors then sync from. If the push was successful, + -current-push is renamed to -current-live. + + LOCKING: + - exclusive locks are held on + - .lock + +- static-mirror-run runs on a mirror and syncs components. + There is a symlink called 'cur' that points to either tree-a or tree-b for + each component. the cur tree is the one that is live, the other one usually + does not exist, except when a sync is ongoing (or a previous one failed and + we keep a partial tree). + + During a sync, we sync to the tree- that is not the live one. When instructed by + static-master-run, we update the symlink and remove the old tree. + + static-mirror-run rsyncs either -current-push or -current-live for a component. + + LOCKING: + during all of static-mirror-run, we keep an exclusive lock on the + dir, i.e., the directory that holds tree-[ab] and cur. + +- static-mirror-run-all + + Run static-mirror-run for all components on this mirror, fetching the -live- tree. + + LOCKING: + none, but see static-mirror-run. + +- staticsync-ssh-wrap + + wrapper for ssh job dispatching on source, master, and mirror. + + LOCKING: + - on master, when syncing -live- trees: + a shared lock is held on .lock during + the rsync process. diff --git a/modules/roles/files/static-mirroring/static-master-run b/modules/roles/files/static-mirroring/static-master-run index aa616826d..7571badb5 100755 --- a/modules/roles/files/static-mirroring/static-master-run +++ b/modules/roles/files/static-mirroring/static-master-run @@ -174,14 +174,22 @@ def run_mirror(component): os.chmod(tmpdir_new, 0755) locks = [] - for p in (componentdir, live, tmpdir_new): - if not os.path.exists(p): os.mkdir(p, 0755) + lockfiles = [ os.path.join(basemaster, component + ".lock") ] + for p in lockfiles: fd = os.open(p, os.O_RDONLY) log("Acquiring lock for %s(%d)."%(p,fd)) fcntl.flock(fd, fcntl.LOCK_EX) locks.append(fd) log("All locks acquired.") + #for p in (componentdir, live, tmpdir_new): + # if not os.path.exists(p): os.mkdir(p, 0755) + # fd = os.open(p, os.O_RDONLY) + # log("Acquiring lock for %s(%d)."%(p,fd)) + # fcntl.flock(fd, fcntl.LOCK_EX) + # locks.append(fd) + #log("All locks acquired.") + serialfile = os.path.join(componentdir, serialname) try: with open(serialfile) as f: serial = int(f.read()) diff --git a/modules/roles/files/static-mirroring/static-master-update-component b/modules/roles/files/static-mirroring/static-master-update-component index 2d397df27..c115ae447 100755 --- a/modules/roles/files/static-mirroring/static-master-update-component +++ b/modules/roles/files/static-mirroring/static-master-update-component @@ -91,6 +91,14 @@ if [ -z "$srchost" ] || [ -z "$srcdir" ]; then echo >&2 "$0: Invalid component: $component (not found in $componentlist)"; exit 1 fi + +tgtlock="$masterbase/$component.lock" +if ! [ -e "$tgtlock" ]; then + touch "$tgtlock" +fi +echo "$0: Acquiring lock on $tgtlock..." +lock 203 "$tgtlock" 1 + tgt="$masterbase/$component" if ! [ -d "$tgt" ]; then echo "$0: Creating $tgt for $component"; @@ -103,15 +111,16 @@ else src="$srchost:$srcdir" fi -echo "$0: Acquiring locks..." -lock 201 "$tgt" 1 +#echo "$0: Acquiring lock on $tgt..." +#lock 201 "$tgt" 1 tmpdir_new="$(mktemp -d --tmpdir="$masterbase" "${component}-updating.incoming-XXXXXX")" tmpdir_old="$(mktemp -d --tmpdir="$masterbase" "${component}-updating.removing-XXXXXX")" trap "rm -rf '$tmpdir_new' '$tmpdir_old'" EXIT chmod 0755 "$tmpdir_new" -lock 202 "$tmpdir_new" 1 +#echo "$0: Acquiring lock on $tmpdir_new..." +#lock 202 "$tmpdir_new" 1 echo "$0: Got them." echo "$0: Updating master copy of $component..." @@ -136,8 +145,9 @@ rm -rf "$tmpdir_new" "$tmpdir_old" trap - EXIT date '+%s' > "$tgt/.serial" -unlock 201 -unlock 202 +#unlock 201 +#unlock 202 +unlock 203 echo "$0: Triggering mirror runs..." exec static-master-run "$component" diff --git a/modules/roles/files/static-mirroring/staticsync-ssh-wrap b/modules/roles/files/static-mirroring/staticsync-ssh-wrap index a4075471d..959b4f8f0 100755 --- a/modules/roles/files/static-mirroring/staticsync-ssh-wrap +++ b/modules/roles/files/static-mirroring/staticsync-ssh-wrap @@ -97,9 +97,13 @@ do_rsync_on_master() { elif [ "$*" = "$args $component/-live-/" ] || [ "$*" = "$args ./$component/-live-/" ] ; then local path="$BASEDIR/master/$component-current-live" info "host $remote_host wants $path, acquiring lock" - exec 200< "$path" + tgtlock="$BASEDIR/master/$component.lock" + if ! [ -e "$tgtlock" ]; then + touch "$tgtlock" + fi + exec 200< "$tgtlock" if ! flock -s -w 0 200; then - echo >&2 "Cannot acquire shared lock on $path - this should mean an update is already underway anyway." + echo >&2 "Cannot acquire shared lock on $tgtlock covering $path - this should mean an update is already underway anyway." exit 1 fi exec rsync $args "$path/." -- 2.20.1