Branch data Line data Source code
1 : : /* SPDX-License-Identifier: GPL-2.0-only */
2 : : /*
3 : : * Copyright (c) Meta Platforms, Inc. and affiliates.
4 : : */
5 : :
6 : : /* `renameat2` and `RENAME_NOREPLACE` require _GNU_SOURCE from glibc. */
7 : : #ifndef _GNU_SOURCE
8 : : #define _GNU_SOURCE
9 : : #endif
10 : :
11 : : #include "core/lock.h"
12 : :
13 : : #include <dirent.h>
14 : : #include <errno.h>
15 : : #include <fcntl.h>
16 : : #include <stdio.h>
17 : : #include <stdlib.h>
18 : : #include <string.h>
19 : : #include <sys/file.h>
20 : : #include <sys/stat.h>
21 : : #include <sys/types.h>
22 : : #include <time.h>
23 : : #include <unistd.h>
24 : :
25 : : #include <bpfilter/ctx.h>
26 : : #include <bpfilter/helper.h>
27 : : #include <bpfilter/io.h>
28 : : #include <bpfilter/logger.h>
29 : :
/** Permission bits `rwxr-xr-x` (0755), used when creating the staging
 * directory with `mkdirat()`. */
#define BF_PERM_755 (S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)

/** Bounded retry count for the "recheck-after-flock" loop (P1). Each failed
 * attempt corresponds to a completed `unlink + recreate` by another
 * `BF_LOCK_WRITE` holder, so this budget is extremely generous in practice. */
#define BF_LOCK_MAX_RETRIES 8

/** Bounded retry count for the staging name collision loop. Collisions are
 * astronomically rare given the random suffix, so a small budget suffices. */
#define BF_LOCK_STAGING_NAME_RETRIES 4

/** Number of random bytes pulled from `/dev/urandom` for the staging suffix.
 * The suffix is hex-encoded, so it occupies twice as many characters. */
#define BF_LOCK_STAGING_RAND_BYTES 8
43 : :
44 : : /**
45 : : * @brief Apply an `flock(2)` of the requested mode on `fd`.
46 : : *
47 : : * `BF_LOCK_NONE` is a no-op; `BF_LOCK_READ` maps to `LOCK_SH`; `BF_LOCK_WRITE`
48 : : * maps to `LOCK_EX`. All requests are non-blocking (`LOCK_NB`): contention
49 : : * returns `-EWOULDBLOCK` immediately rather than waiting.
50 : : *
51 : : * @param fd File descriptor to lock.
52 : : * @param mode Locking mode, see `bf_lock_mode`.
53 : : * @return 0 on success, or a negative errno value on failure.
54 : : */
55 : 14181 : static int _bf_flock(int fd, enum bf_lock_mode mode)
56 : : {
57 : : int op = LOCK_NB;
58 : :
59 [ + - ]: 14181 : if (mode >= _BF_LOCK_MAX)
60 : : return -EINVAL;
61 : :
62 [ + - ]: 14181 : if (fd < 0)
63 : : return -EBADFD;
64 : :
65 [ + + ]: 14181 : if (mode == BF_LOCK_NONE)
66 : : return 0;
67 : :
68 [ + + ]: 14177 : op |= (mode == BF_LOCK_WRITE) ? LOCK_EX : LOCK_SH;
69 : :
70 [ + + ]: 14177 : if (flock(fd, op) < 0)
71 : 4 : return -errno;
72 : :
73 : : return 0;
74 : : }
75 : :
76 : : /**
77 : : * @brief Fill `buf` with a unique staging name.
78 : : *
79 : : * The name has the form `<prefix><pid>_<hex>` where `<hex>` is a hex-encoded
80 : : * random suffix pulled from `/dev/urandom`. Uniqueness isn't strictly required
81 : : * (collisions cause `mkdirat(EEXIST)` and are retried by the caller), but a
82 : : * unique name avoids any chance of contention on the staging flock.
83 : : *
84 : : * If `/dev/urandom` cannot be read, fall back to a name based on `pid` and
85 : : * `time(NULL)`; the caller's retry loop will paper over the rare collision
86 : : * that this fallback could produce.
87 : : *
88 : : * @param buf Buffer to write the staging name into.
89 : : * @param size Size of `buf`.
90 : : */
91 : 1310 : static void _bf_make_staging_name(char *buf, size_t size)
92 : : {
93 : 1310 : unsigned char rand[BF_LOCK_STAGING_RAND_BYTES] = {0};
94 : : char hex[(BF_LOCK_STAGING_RAND_BYTES * 2) + 1];
95 : 2620 : _cleanup_close_ int fd = -1;
96 : : ssize_t n = -1;
97 : :
98 : 1310 : fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
99 [ + - ]: 1310 : if (fd >= 0)
100 : 1310 : n = read(fd, rand, sizeof(rand));
101 : :
102 [ - + ]: 1310 : if (n != (ssize_t)sizeof(rand)) {
103 : : /* Fallback: derive bytes from `time(NULL)` so two processes that
104 : : * both fail to read /dev/urandom in the same second still differ on
105 : : * `pid`. Collisions are handled by the caller's retry loop. */
106 : 0 : unsigned long fallback = (unsigned long)time(NULL);
107 [ # # ]: 0 : for (size_t i = 0; i < sizeof(rand); ++i)
108 : 0 : rand[i] = (unsigned char)(fallback >> (i * 8));
109 : : }
110 : :
111 [ + + ]: 11790 : for (size_t i = 0; i < sizeof(rand); ++i)
112 : 10480 : (void)snprintf(&hex[i * 2], 3, "%02x", rand[i]);
113 : :
114 : : /* bpffs rejects names starting with '.', so the prefix and format must
115 : : * stick to [a-zA-Z0-9_-]. */
116 : 1310 : (void)snprintf(buf, size, "%s%d_%s", BF_LOCK_STAGING_PREFIX, (int)getpid(),
117 : : hex);
118 : 1310 : }
119 : :
120 : : /**
121 : : * @brief Stage-and-rename primitive (I3).
122 : : *
123 : : * Create a uniquely-named staging directory under `pindir_fd`, open it,
124 : : * acquire an exclusive flock on it, then atomically publish it as `name`
125 : : * via `renameat2(RENAME_NOREPLACE)`.
126 : : *
127 : : * On success, returns a locked file descriptor referring to the inode now
128 : : * reachable as `<pindir>/<name>`.
129 : : *
130 : : * On failure, any state created (staging dir) is cleaned up; no side
131 : : * effects leak out.
132 : : * @param pindir_fd File descriptor of the pin directory.
133 : : * @param name Name of the directory (in the pin directory) to open.
134 : : * @return The open and locked file descriptor on success, or a negative errno
135 : : * value on failure.
136 : : */
137 : 1310 : static int _bf_lock_stage_and_publish(int pindir_fd, const char *name)
138 : : {
139 : : char staging[NAME_MAX];
140 : 1310 : _cleanup_close_ int fd = -1;
141 : : int r;
142 : :
143 : : assert(name);
144 : :
145 : : /* 1. Create a unique staging directory. Retry a small number of times
146 : : * if we happen to collide with our own prior staging dirs (should
147 : : * be astronomically rare given the random suffix). */
148 : 1310 : for (int attempt = 0; attempt < BF_LOCK_STAGING_NAME_RETRIES; ++attempt) {
149 : 1310 : _bf_make_staging_name(staging, sizeof(staging));
150 [ - + ]: 1310 : if (mkdirat(pindir_fd, staging, BF_PERM_755) == 0)
151 : : break;
152 [ # # ]: 0 : if (errno != EEXIST) {
153 [ # # ]: 0 : return bf_err_r(-errno,
154 : : "failed to create staging dir '%s' under pindir",
155 : : staging);
156 : : }
157 [ # # ]: 0 : if (attempt == BF_LOCK_STAGING_NAME_RETRIES - 1) {
158 [ # # ]: 0 : return bf_err_r(
159 : : -EAGAIN,
160 : : "failed to generate a unique staging dir name after retries");
161 : : }
162 : : }
163 : :
164 : : /* 2. Open the staging directory. */
165 : 1310 : fd = openat(pindir_fd, staging, O_DIRECTORY);
166 [ - + ]: 1310 : if (fd < 0) {
167 : 0 : r = -errno;
168 : 0 : (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
169 [ # # ]: 0 : return bf_err_r(r, "failed to open staging dir '%s'", staging);
170 : : }
171 : :
172 : : /* 3. Exclusively lock the staging inode. Cannot contend: we own the
173 : : * unique staging name. */
174 : 1310 : r = _bf_flock(fd, BF_LOCK_WRITE);
175 [ - + ]: 1310 : if (r) {
176 : 0 : (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
177 [ # # ]: 0 : return bf_err_r(r, "failed to flock staging dir '%s'", staging);
178 : : }
179 : :
180 : : /* 4. Publish atomically. `RENAME_NOREPLACE` ensures we lose cleanly to
181 : : * any concurrent creator that already claimed `name`. */
182 [ - + ]: 1310 : if (renameat2(pindir_fd, staging, pindir_fd, name, RENAME_NOREPLACE) < 0) {
183 : 0 : r = -errno;
184 : : /* Staging dir was never published; safe to remove (we hold its
185 : : * flock, nobody else can observe or lock it). */
186 : 0 : (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
187 : 0 : return r;
188 : : }
189 : :
190 : 1310 : return TAKE_FD(fd);
191 : : }
192 : :
193 : : /**
194 : : * @brief Open an existing chain dir with the recheck-after-flock protocol
195 : : * (P1).
196 : : *
197 : : * Between `openat` and `flock`, the name might be unlinked and recreated by
198 : : * another `BF_LOCK_WRITE` holder, which would leave us holding a lock on an
199 : : * orphaned inode. Detect this by comparing the inode we locked against the
200 : : * one currently reachable via the name, and retry on mismatch.
201 : : *
202 : : * On success, returns a locked file descriptor whose inode is guaranteed to
203 : : * be the one currently bound to `name`.
204 : : * @param pindir_fd File descriptor of the directory containing `name`.
205 : : * @param name Name of the directory to open and lock.
206 : : * @param mode Locking mode for `name`.
207 : : * @return Open and locked file descriptor to `name` in `pindir_fd`, or a
208 : : * negative errno value on failure.
209 : : */
210 : 6570 : static int _bf_lock_open_existing(int pindir_fd, const char *name,
211 : : enum bf_lock_mode mode)
212 : : {
213 : : int r;
214 : :
215 : : assert(name);
216 : :
217 [ + - ]: 6570 : for (int attempt = 0; attempt < BF_LOCK_MAX_RETRIES; ++attempt) {
218 : 6570 : _cleanup_close_ int fd = -1;
219 : : struct stat open_st;
220 : : struct stat live_st;
221 : :
222 : 6570 : fd = openat(pindir_fd, name, O_DIRECTORY);
223 [ + + ]: 6570 : if (fd < 0)
224 : 640 : return -errno;
225 : :
226 : 5930 : r = _bf_flock(fd, mode);
227 [ + - ]: 5930 : if (r)
228 : : return r;
229 : :
230 [ - + ]: 5930 : if (fstat(fd, &open_st) < 0)
231 : 0 : return -errno;
232 : :
233 [ - + ]: 5930 : if (fstatat(pindir_fd, name, &live_st, AT_SYMLINK_NOFOLLOW) < 0) {
234 : : /* Name is gone (ENOENT) or inaccessible. Retry. */
235 : : continue;
236 : : }
237 : :
238 [ + - ]: 5930 : if (open_st.st_dev == live_st.st_dev &&
239 [ + - ]: 5930 : open_st.st_ino == live_st.st_ino)
240 : 5930 : return TAKE_FD(fd);
241 : :
242 : : /* Mismatch: the name now resolves to a different inode. Our fd is
243 : : * pinned to the orphaned inode; drop it and retry. The flock is
244 : : * released when fd is closed. */
245 : : }
246 : :
247 [ # # ]: 0 : return bf_err_r(
248 : : -EAGAIN,
249 : : "failed to stably open chain '%s' after %d retries; likely extreme contention",
250 : : name, BF_LOCK_MAX_RETRIES);
251 : : }
252 : :
253 : 6942 : int bf_lock_init(struct bf_lock *lock, enum bf_lock_mode mode)
254 : : {
255 : 6942 : _clean_bf_lock_ struct bf_lock _lock = bf_lock_default();
256 : : const char *bpffs_path;
257 : : int r;
258 : :
259 : : assert(lock);
260 : :
261 : 6942 : bpffs_path = bf_ctx_get_bpffs_path();
262 [ + + ]: 6942 : if (!bpffs_path)
263 [ + - ]: 1 : return bf_err_r(-EINVAL, "context is not initialized");
264 : :
265 : 6941 : _lock.bpffs_fd = bf_opendir(bpffs_path);
266 [ - + ]: 6941 : if (_lock.bpffs_fd < 0) {
267 [ # # ]: 0 : return bf_err_r(_lock.bpffs_fd, "failed to open bpffs at %s",
268 : : bpffs_path);
269 : : }
270 : :
271 : : /* Create the pin directory lazily. Per I1, it is never removed by the
272 : : * library, so subsequent `openat` calls always see the same inode. */
273 : 6941 : _lock.pindir_fd = bf_opendir_at(_lock.bpffs_fd, "bpfilter", true);
274 [ - + ]: 6941 : if (_lock.pindir_fd < 0) {
275 [ # # ]: 0 : return bf_err_r(_lock.pindir_fd,
276 : : "failed to open pin directory %s/bpfilter", bpffs_path);
277 : : }
278 : :
279 : 6941 : r = _bf_flock(_lock.pindir_fd, mode);
280 [ + + ]: 6941 : if (r)
281 : : return r;
282 : :
283 : 6937 : _lock.pindir_lock = mode;
284 : :
285 : 6937 : bf_swap(*lock, _lock);
286 : :
287 : 6937 : return 0;
288 : : }
289 : :
290 : 4124 : int bf_lock_init_for_chain(struct bf_lock *lock, const char *name,
291 : : enum bf_lock_mode pindir_mode,
292 : : enum bf_lock_mode chain_mode, bool create)
293 : : {
294 : 4124 : _clean_bf_lock_ struct bf_lock _lock = bf_lock_default();
295 : : int r;
296 : :
297 : : assert(lock);
298 : : assert(name);
299 : :
300 [ + + ]: 4124 : if (create && pindir_mode != BF_LOCK_WRITE) {
301 [ + - ]: 1 : return bf_err_r(
302 : : -EINVAL,
303 : : "creating a chain requires BF_LOCK_WRITE on the pin directory");
304 : : }
305 : :
306 : 4123 : r = bf_lock_init(&_lock, pindir_mode);
307 [ + - ]: 4123 : if (r)
308 : : return r;
309 : :
310 : 4123 : r = bf_lock_acquire_chain(&_lock, name, chain_mode, create);
311 [ + + ]: 4123 : if (r)
312 : : return r;
313 : :
314 : 4112 : bf_swap(*lock, _lock);
315 : :
316 : 4112 : return 0;
317 : : }
318 : :
319 : 18010 : void bf_lock_cleanup(struct bf_lock *lock)
320 : : {
321 : : assert(lock);
322 : :
323 : : // Quick exit if `lock` wasn't initialized
324 [ + + ]: 18010 : if (lock->bpffs_fd < 0)
325 : : return;
326 : :
327 : 6941 : bf_lock_release_chain(lock);
328 : :
329 : : /* Per I1, do NOT remove the pin directory. It persists for the
330 : : * lifetime of the bpffs mount. */
331 : 6941 : closep(&lock->pindir_fd);
332 : 6941 : lock->pindir_lock = BF_LOCK_NONE;
333 : :
334 : 6941 : closep(&lock->bpffs_fd);
335 : : }
336 : :
337 : 7885 : int bf_lock_acquire_chain(struct bf_lock *lock, const char *name,
338 : : enum bf_lock_mode mode, bool create)
339 : : {
340 : : _cleanup_free_ char *_name = NULL;
341 : 7885 : _cleanup_close_ int chain_fd = -1;
342 : :
343 : : assert(lock);
344 : : assert(name);
345 : :
346 [ + + - + ]: 7885 : if (lock->bpffs_fd < 0 || lock->pindir_fd < 0) {
347 [ + - ]: 1 : return bf_err_r(
348 : : -EBADFD,
349 : : "attempting to acquire a chain lock on an invalid bf_lock");
350 : : }
351 : :
352 [ + + ]: 7884 : if (lock->chain_fd >= 0) {
353 [ + - ]: 2 : return bf_err_r(-EINVAL, "bf_lock already locks chain '%s'",
354 : : lock->chain_name);
355 : : }
356 : :
357 [ + + ]: 7882 : if (create) {
358 [ + + ]: 1312 : if (mode != BF_LOCK_WRITE) {
359 [ + - ]: 2 : return bf_err_r(
360 : : -EINVAL,
361 : : "creating a chain requires BF_LOCK_WRITE on the chain directory");
362 : : }
363 [ - + ]: 1310 : if (lock->pindir_lock != BF_LOCK_WRITE) {
364 [ # # ]: 0 : return bf_err_r(
365 : : -EINVAL,
366 : : "creating a chain requires BF_LOCK_WRITE on the pin directory");
367 : : }
368 : : }
369 : :
370 : 7880 : _name = strdup(name);
371 [ + - ]: 7880 : if (!_name)
372 : : return -ENOMEM;
373 : :
374 [ + + ]: 7880 : if (create) {
375 : : /* Stage-and-rename (I3). Returns a locked fd already reachable at
376 : : * the final name. */
377 : 1310 : chain_fd = _bf_lock_stage_and_publish(lock->pindir_fd, _name);
378 [ + - ]: 1310 : if (chain_fd < 0)
379 : : return chain_fd;
380 : : } else {
381 : : /* Recheck-after-flock (P1). Returns a locked fd for the live
382 : : * inode of `name`, or an error. */
383 : 6570 : chain_fd = _bf_lock_open_existing(lock->pindir_fd, _name, mode);
384 [ + + ]: 6570 : if (chain_fd < 0)
385 : : return chain_fd;
386 : : }
387 : :
388 : 7240 : lock->chain_fd = TAKE_FD(chain_fd);
389 : 7240 : lock->chain_name = TAKE_PTR(_name);
390 : 7240 : lock->chain_lock = mode;
391 : :
392 : 7240 : return 0;
393 : : }
394 : :
395 : 8801 : void bf_lock_release_chain(struct bf_lock *lock)
396 : : {
397 : : assert(lock);
398 : :
399 [ + + - + ]: 8801 : if (lock->bpffs_fd < 0 || lock->pindir_fd < 0) {
400 [ + - ]: 1 : bf_warn("attempting to release a chain lock on an invalid bf_lock");
401 : 1 : return;
402 : : }
403 : :
404 [ + + ]: 8800 : if (lock->chain_fd < 0)
405 : : return;
406 : :
407 : : /* Only WRITE locks will be used to create or modify a chain, meaning
408 : : * if a READ or NONE lock was held on a chain directory, that directory
409 : : * hasn't been modified during the lifetime of the lock. So there is no
410 : : * need to attempt to remove it.
411 : : *
412 : : * Per I2, this removal is only race-free against concurrent readers if
413 : : * the caller also holds BF_LOCK_WRITE on the pin directory. The
414 : : * locking matrix documented in `lock.h` ensures this. */
415 [ + + ]: 7240 : if (lock->chain_lock == BF_LOCK_WRITE)
416 : 2604 : (void)unlinkat(lock->pindir_fd, lock->chain_name, AT_REMOVEDIR);
417 : :
418 : 7240 : closep(&lock->chain_fd);
419 : :
420 : : freep((void *)&lock->chain_name);
421 : 7240 : lock->chain_lock = BF_LOCK_NONE;
422 : : }
|