The goal here is to keep every running process's executable code in memory during memory pressure, on Linux.

In Linux, I am able to almost instantly (within ~1 sec) cause high memory pressure and trigger the OOM-killer with the following (code from here), run with 24000 MB max RAM inside a Qubes OS R4.0 Fedora 28 AppVM:

    stress --vm-bytes $(awk '/MemAvailable/{printf "%d\n", $2 + 4000;}' < /proc/meminfo)k --vm-keep -m 4 --timeout 10s

EDIT4: Perhaps relevant, yet I forgot to mention it: I have no swap enabled (i.e. CONFIG_SWAP is not set).
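
For the record, a quick way to confirm swap really is fully off (the kernel config path varies by distro, and /proc/config.gz only exists when CONFIG_IKCONFIG_PROC is enabled):

    swapon --show                                     # prints nothing when no swap is active
    zgrep CONFIG_SWAP /proc/config.gz 2>/dev/null \
      || grep CONFIG_SWAP "/boot/config-$(uname -r)"  # expect: # CONFIG_SWAP is not set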

dmesg reports:

[  867.746593] Mem-Info:
[  867.746607] active_anon:1390927 inactive_anon:4670 isolated_anon:0
                active_file:94 inactive_file:72 isolated_file:0
                unevictable:13868 dirty:0 writeback:0 unstable:0
                slab_reclaimable:5906 slab_unreclaimable:12919
                mapped:1335 shmem:4805 pagetables:5126 bounce:0
                free:40680 free_pcp:978 free_cma:0

The interesting parts are active_file:94 inactive_file:72. These counters are in pages, not kilobytes (unlike /proc/meminfo, which reports kB), so with 4 KiB pages that is well under 1 MB of file-backed cache, which is very low.
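
As a quick conversion check, assuming the usual 4 KiB page size:

    echo $(( 94 * $(getconf PAGESIZE) / 1024 ))    # -> 376, i.e. the 94 active_file pages are ~376 kB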

The problem is that during that period of memory pressure, executable code is re-read from disk, causing disk thrashing that freezes the OS (though in the above case it only lasts for less than a second).

I see some interesting code in the kernel's mm/vmscan.c:

        if (page_referenced(page, 0, sc->target_mem_cgroup,
                            &vm_flags)) {
                nr_rotated += hpage_nr_pages(page);
                /*
                 * Identify referenced, file-backed active pages and
                 * give them one more trip around the active list. So
                 * that executable code get better chances to stay in
                 * memory under moderate memory pressure.  Anon pages
                 * are not likely to be evicted by use-once streaming
                 * IO, plus JVM can create lots of anon VM_EXEC pages,
                 * so we ignore them here.
                 */
                if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                        list_add(&page->lru, &l_active);
                        continue;
                }
        }

I think that if someone could point out how to change this so that, instead of "give them one more trip around the active list", we get it to give them infinite trips around the active list, then the job should be done. Or maybe there's some other way?

I can patch and test a custom kernel. I just don't have the know-how as to what to change in the code in order to always keep active executable code in memory (which, in effect, I believe would avoid disk thrashing).

EDIT: Here's what I got working so far (applied on top of kernel 4.18.5):

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..7636498 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -208,7 +208,7 @@ enum lru_list {

 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

-#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
+#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_INACTIVE_FILE; lru++)

 static inline int is_file_lru(enum lru_list lru)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..1f3ffb5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2234,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,

    anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
-   file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+   file  = //lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);

    spin_lock_irq(&pgdat->lru_lock);
@@ -2345,7 +2345,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
             sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
-   while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+   while (nr[LRU_INACTIVE_ANON] || //nr[LRU_ACTIVE_FILE] ||
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;
@@ -2372,7 +2372,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
-       nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+       nr_file = nr[LRU_INACTIVE_FILE] //+ nr[LRU_ACTIVE_FILE]
+           ;
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
@@ -2391,7 +2392,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
-                       targets[LRU_ACTIVE_FILE] + 1;
+                       //targets[LRU_ACTIVE_FILE] + 
+                       1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }

Also seen here on GitHub, because in the above code tabs got transformed into spaces! (mirror1, mirror2)

I've tested the above patch (on 4000 MB max RAM now, yes 20 G less than before!) even with a Firefox compilation that was known to disk-thrash the OS into a permanent freeze, and the freeze no longer happens (the oom-killer almost instantly kills the offending process(es)). The above stress command now yields:

[  745.830511] Mem-Info:
[  745.830521] active_anon:855546 inactive_anon:20453 isolated_anon:0
                active_file:26925 inactive_file:76 isolated_file:0
                unevictable:10652 dirty:0 writeback:0 unstable:0
                slab_reclaimable:26975 slab_unreclaimable:13525
                mapped:24238 shmem:20456 pagetables:4028 bounce:0
                free:14935 free_pcp:177 free_cma:0

That's active_file:26925 inactive_file:76, i.e. roughly 105 MB of active file cache (26925 pages at 4 KiB each)...
So I don't know how good this is. Am I keeping all active files, instead of just executable files, in memory? During the Firefox compilation I had something like 500 MB of Active(file) (EDIT2: that's according to cat /proc/meminfo|grep -F -- 'Active(file)', which reports kB and therefore shows a different value than the page-count active_file: from dmesg!), which makes me doubt it was only exes/libs...
Maybe someone can suggest how to keep ONLY executable code? (if that's not what's already happening)
Thoughts?

EDIT3: With the above patch, it seems necessary to (periodically?) run sudo sysctl vm.drop_caches=1 to free some stale memory(?). For example, if I call stress right after a Firefox compilation I get active_file:142281 inactive_file:0 isolated_file:0 (about 556 MB of active file pages); if I then drop the file caches (another way: echo 1|sudo tee /proc/sys/vm/drop_caches) and run stress again, I get active_file:22233 inactive_file:160 isolated_file:0 (about 87 MB). I am unsure...

Results without the above patch: here
Results with the above patch: here

  • I would try to piggyback on mlockall() somehow if there were only a single specific executable I wanted to keep in core. To keep multiple executables in memory, I would think of creating a small ramfs partition and copying the required executables there (see the sketch after these comments).
    – gudok
    Commented Aug 31, 2018 at 9:35
  • Also consider using earlyoom.
    – gudok
    Commented Aug 31, 2018 at 9:46
  • @gudok I need all active executables kept in RAM so that when context switches happen (more specifically, when a process resumes execution), their file-backed executable code pages don't have to be re-read from disk (which is what's causing the disk thrashing). So far, the patch from the EDIT does the job of keeping every active executable in RAM (it seems), thus reducing disk thrashing almost entirely, and thus I experience no more permanent OS freezes. Thanks for the earlyoom link!
    – user10239615
    Commented Aug 31, 2018 at 10:11
  • I would suggest trying to set a minimum limit on the amount of active pages instead of blocking all eviction of active pages. It should be pretty easy to trigger the OOM killer if the number of active pages goes low enough while available memory is low at the same time. That would allow the kernel to keep some smarts about cleaning up the active list, but still limit the worst-case behavior until the OOM killer is triggered.
    – Mikko Rantalainen
    Commented Jan 8, 2019 at 13:29
  • @MikkoRantalainen I finally did it today and it works, but for some reason it freezes the system most of the time (I probably missed something?). But anyway, I managed to keep 256 MiB of Active(file):, and thus disk thrashing is gone. Now if only it wouldn't freeze for some other reason! See le9g.patch in this comment: gist.github.com/constantoverride/…
    – user11509478
    Commented Aug 13, 2019 at 13:16
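
A minimal sketch of the ramfs idea from the first comment above (hypothetical paths and binary name; the point is that ramfs pages have no backing store, so unlike tmpfs they can never be evicted):

    sudo mkdir -p /mnt/pinned
    sudo mount -t ramfs ramfs /mnt/pinned
    sudo cp /usr/bin/important-daemon /mnt/pinned/   # important-daemon is a placeholder
    /mnt/pinned/important-daemon &                   # run the pinned copy, not the on-disk one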

3 Answers

WARNING: Do not use this patch if you have swap enabled, because two users have reported worse effects. I've only tested this patch with swap disabled in the kernel (i.e. CONFIG_SWAP is not set)!

Until further notice (or until someone comes up with something better), I am using the following patch (and it works, for me) in order to avoid any disk thrashing / OS freeze when about to run out of memory, so that the OOM-killer triggers as soon as possible (within 1 sec at most):

revision 3
preliminary patch to avoid disk thrashing (constant reading) under memory pressure before OOM-killer triggers
more info: https://gist.github.com/constantoverride/84eba764f487049ed642eb2111a20830

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..7636498 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -208,7 +208,7 @@ enum lru_list {

 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

-#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
+#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_INACTIVE_FILE; lru++)

 static inline int is_file_lru(enum lru_list lru)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..1f3ffb5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2086,9 +2086,9 @@ static unsigned long shrink_list(enum lr
                 struct scan_control *sc)
 {
    if (is_active_lru(lru)) {
-       if (inactive_list_is_low(lruvec, is_file_lru(lru),
-                    memcg, sc, true))
-           shrink_active_list(nr_to_scan, lruvec, sc, lru);
+       //if (inactive_list_is_low(lruvec, is_file_lru(lru),
+       //           memcg, sc, true))
+       //  shrink_active_list(nr_to_scan, lruvec, sc, lru);
        return 0;
    }

@@ -2234,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,

    anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
-   file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+   file  = //lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);

    spin_lock_irq(&pgdat->lru_lock);
@@ -2345,7 +2345,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
             sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
-   while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+   while (nr[LRU_INACTIVE_ANON] || //nr[LRU_ACTIVE_FILE] ||
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;
@@ -2372,7 +2372,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
-       nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+       nr_file = nr[LRU_INACTIVE_FILE] //+ nr[LRU_ACTIVE_FILE]
+           ;
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
@@ -2391,7 +2392,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
-                       targets[LRU_ACTIVE_FILE] + 1;
+                       //targets[LRU_ACTIVE_FILE] + 
+                       1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }
@@ -2409,10 +2411,12 @@ static void shrink_node_memcg(struct pgl
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);

+       if (LRU_FILE != lru) { //avoid this block for LRU_ACTIVE_FILE
        lru += LRU_ACTIVE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);
+       }

        scan_adjusted = true;
    }

Unfortunately the above converted tabs into spaces, so if you want the raw patch, it's here.

What this patch does is avoid evicting Active(file) pages under memory pressure, so that each running process's executable pages don't have to be re-read from disk on every context switch just for the program to continue running (the re-reads show up in iotop attributed to each program itself, rather than to kswapd0). A ton of disk thrashing is thus avoided, and the OS no longer freezes to a crawl.
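
You can watch this effect with iotop; for example (-o shows only processes actually doing I/O, -b is batch mode, -n limits the iterations):

    sudo iotop -o -b -n 5
    # unpatched, under pressure: the running programs themselves show constant disk reads
    # patched: those reads are mostly gone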

The above was tested with kernel 4.18.5 (and now I'm testing 4.18.7) inside Qubes OS 4.0's dom0 (Fedora 25) and in all the VMs (Fedora 28) that I'm using.

For the first version of this patch, which apparently works just as well, see the EDIT on the very question that this is an answer to.

UPDATE: After using this patch for a while on an ArchLinux laptop with 16 G of RAM (minus 512 M reserved for the integrated graphics card) and no swap (disabled in the kernel too), I can say that the system can run out of memory sooner than without the le9d.patch (rev. 3), so the OOM-killer triggers for Xorg or chromium or others when it wouldn't have without the patch. As a mitigation, which seems to work for me so far, I've been running echo 1 > /proc/sys/vm/drop_caches whenever the Active(file) number in /proc/meminfo is over 2 G, i.e. 2000000 kB (e.g. get the number of kB via grep 'Active(file):' /proc/meminfo|tr -d ' '|cut -f2 -d:|sed 's/kB//'), doing this check with a sleep 5 afterwards. But lately, in order to compile firefox-hg in /tmp (which is tmpfs and ultimately uses 12 G) and ensure it doesn't get OOM-killed, I've been using 500000 instead of 2000000 kB. It sure is better than freezing the entire system (i.e. running without le9d.patch), which is what would have happened in this Firefox compilation case. Without this check, Active(file) goes no higher than 4 G, but that's enough to OOM-kill Xorg if something wants more memory, such as in this Firefox compilation case, or even when just copying many gigabytes via Midnight Commander (if I remember correctly).
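
That check, written out as a small watchdog loop (a sketch of the procedure described above, not a polished tool; adjust THRESHOLD_KB to your workload, e.g. 2000000 or 500000 as mentioned):

    #!/bin/sh
    # drop file caches whenever Active(file) exceeds the threshold; re-check every 5 s
    THRESHOLD_KB=2000000
    while sleep 5; do
        active_kb=$(grep 'Active(file):' /proc/meminfo | tr -d ' ' | cut -f2 -d: | sed 's/kB//')
        if [ "$active_kb" -gt "$THRESHOLD_KB" ]; then
            echo 1 > /proc/sys/vm/drop_caches   # needs root
        fi
    done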


To answer the question, here's a simple/preliminary patch to not evict Active(file) (as seen in /proc/meminfo) if it's under 256 MiB; it seems to work OK (no disk thrashing) with linux-stable 5.2.4:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index dbdc46a84f63..7a0b7e32ff45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2445,6 +2445,15 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
            BUG();
        }

+    if (lru == LRU_ACTIVE_FILE) {
+      /* global_node_page_state() returns a page count; convert to KiB */
+      long long kib_active_file_now =
+          global_node_page_state(NR_ACTIVE_FILE) * (PAGE_SIZE / 1024);
+      if (kib_active_file_now <= 256*1024) {
+        nr[lru] = 0; /* don't reclaim any Active(file) (see /proc/meminfo) while under 256 MiB */
+        continue;
+      }
+    }
        *lru_pages += size;
        nr[lru] = scan;
    }
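
Once booted into the patched kernel, a simple way to check that the floor holds is to watch Active(file) in /proc/meminfo (which reports kB, so 256 MiB = 262144 kB) while applying pressure with the stress command from the question:

    while sleep 1; do grep -F 'Active(file):' /proc/meminfo; done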

Note that some yet-to-be-found regression in kernel 5.3.0-rc4-gd45331b00ddb will cause a system freeze (without disk thrashing, and sysrq will still work) even without this patch.

(any new developments related to this should be happening here.)

  • @MikkoRantalainen Gah, so lazy to plug in the other hard disk (well, SSD) and re-enable VM stuff in the BIOS so Qubes can run, but I might do it if a mod doesn't mark it eventually. Btw, see the updated patch le9h.patch, which adds a sysctl option...
    – user11509478
    Commented Aug 16, 2019 at 15:19

The memory.min parameter in the cgroups-v2 memory controller should help.

Namely, let me quote:

"Hard memory protection. If the memory usage of a cgroup is within its effective min boundary, the cgroup’s memory won’t be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer is invoked."

https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
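
A sketch of how this could be used (assuming a cgroup-v2 unified hierarchy mounted at /sys/fs/cgroup, run as root; the group name protected and the 512 MiB figure are arbitrary):

    echo +memory > /sys/fs/cgroup/cgroup.subtree_control   # delegate the memory controller
    mkdir /sys/fs/cgroup/protected
    echo $((512 * 1024 * 1024)) > /sys/fs/cgroup/protected/memory.min
    echo $$ > /sys/fs/cgroup/protected/cgroup.procs        # move this shell (and children) in

Note that memory.min only protects memory actually charged to that cgroup, and the effective protection is capped by the memory.min of every ancestor cgroup.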

  • How do you use that exactly? I ask because I'm failing to get it to work.
    – user11509478
    Commented Jul 9, 2019 at 15:37