all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Alexandre Derumier <aderumier@odiso.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH pve-common 1/4] move PVE::LXC::CGroup to PVE::CGroup
Date: Fri, 30 Oct 2020 10:42:14 +0100	[thread overview]
Message-ID: <20201030094217.53131-2-aderumier@odiso.com> (raw)
In-Reply-To: <20201030094217.53131-1-aderumier@odiso.com>

---
 src/Makefile      |   1 +
 src/PVE/CGroup.pm | 582 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 583 insertions(+)
 create mode 100644 src/PVE/CGroup.pm

diff --git a/src/Makefile b/src/Makefile
index 1987d0e..b2a4ac6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,6 +13,7 @@ LIB_SOURCES = \
 	CLIHandler.pm \
 	CalendarEvent.pm \
 	CpuSet.pm \
+	CGroup.pm \
 	Daemon.pm \
 	Exception.pm \
 	INotify.pm \
diff --git a/src/PVE/CGroup.pm b/src/PVE/CGroup.pm
new file mode 100644
index 0000000..7e12af9
--- /dev/null
+++ b/src/PVE/CGroup.pm
@@ -0,0 +1,582 @@
+# cgroup handler
+#
+# This package should deal with figuring out the right cgroup path for a
+# container (via the command socket), reading and writing cgroup values, and
+# handling cgroup v1 & v2 differences.
+#
+# Note that the long term plan is to have resource management functions instead
+# of dealing with cgroup files on the outside.
+
+package PVE::CGroup;
+
+use strict;
+use warnings;
+
+use IO::File;
+use IO::Select;
+use POSIX qw();
+
+use PVE::ProcFSTools;
+use PVE::Tools qw(
+    file_get_contents
+    file_read_firstline
+);
+
+use PVE::LXC::Command;
+
+# Every cgroup read/write needs the container's cgroup path. To avoid a command
+# socket round trip for each access, the paths are cached on an instance, which
+# is why this package has to be instantiated.
+#
+# LXC keeps separate paths per controller (although they're normally all the
+# same, in our case anyway), so the cache is keyed by controller as well.
+#
+# Takes the guest's `$vmid` and returns the blessed handler object.
+sub new {
+    my ($class, $vmid) = @_;
+
+    return bless { vmid => $vmid }, $class;
+}
+
+# Collect the cgroupv1 controllers listed in `/proc/self/cgroup`.
+#
+# In list context this returns a set (hash reference mapping controller names
+# to `1`) followed by a boolean telling whether an entry with an empty
+# controller list, i.e. a unified (cgroupv2) hierarchy, was seen. In scalar
+# context only the set is returned.
+#
+# Deprecated: Use `get_cgroup_controllers()` instead.
+sub get_v1_controllers {
+    my %v1_set;
+    my $has_unified = 0;
+
+    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
+
+    # Each line has the form `<id>:<comma separated controllers>:<path>`.
+    while ($content =~ /^\d+:([^:\n]*):.*$/gm) {
+	my $names = $1;
+	if ($names eq '') {
+	    # An empty controller field marks the unified hierarchy.
+	    $has_unified = 1;
+	    next;
+	}
+	for my $name (split(/,/, $names)) {
+	    $v1_set{$name} = 1;
+	}
+    }
+
+    return \%v1_set if !wantarray;
+    return (\%v1_set, $has_unified);
+}
+
+# Get the set of v2 controllers from the `cgroup.controllers` file.
+#
+# The pure cgroupv2 mount point is tried first, then the `unified` directory
+# used by hybrid layouts. Returns a set (hash reference mapping names to `1`),
+# or `undef` if neither file yielded a value.
+my sub get_v2_controllers {
+    my $list = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') };
+    $list ||= eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') };
+    return undef if !defined($list);
+
+    # It's a simple space separated list:
+    my %set = map { $_ => 1 } split(/\s+/, $list);
+    return \%set;
+}
+
+# Cache for `get_cgroup_controllers()`: `[ $v1_set, $v2_set ]` once computed.
+my $CGROUP_CONTROLLERS = undef;
+# Get a list of controllers enabled in each cgroup subsystem.
+#
+# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
+#
+# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
+# version. The second one is whatever `get_v2_controllers()` returned and may
+# therefore be `undef`. The result is computed once and then served from the
+# cache above.
+sub get_cgroup_controllers() {
+    return $CGROUP_CONTROLLERS->@* if defined($CGROUP_CONTROLLERS);
+
+    # Only the controller set is of interest, not the "unified seen" flag.
+    my ($v1) = get_v1_controllers();
+    my $v2 = get_v2_controllers();
+    $CGROUP_CONTROLLERS = [$v1, $v2];
+
+    return $CGROUP_CONTROLLERS->@*;
+}
+
+# Cache for `cgroup_mode()`, stays `undef` until a mode could be determined.
+my $CGROUP_MODE = undef;
+# Figure out which cgroup mode we're operating under:
+#
+# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
+# cgroupv2-only environment. Dies if neither is the case.
+#
+# NOTE: To fully support a hybrid layout it is better to use functions like
+# `cpuset_controller_path`.
+#
+# This is a function, not a method!
+sub cgroup_mode() {
+    return $CGROUP_MODE if defined($CGROUP_MODE);
+
+    my ($v1, $v2) = get_cgroup_controllers();
+
+    # Any v1 controller means hybrid or legacy mode, otherwise a v2 controller
+    # set means cgroupv2-only.
+    $CGROUP_MODE = (keys %$v1) ? 1
+	: $v2 ? 2
+	: undef;
+
+    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
+    return $CGROUP_MODE;
+}
+
+# Cache for `cgroupv2_base_path()`.
+my $CGROUPV2_PATH = undef;
+# Get the mount point of the unified (cgroupv2) hierarchy.
+#
+# In a cgroupv2-only environment this is `/sys/fs/cgroup`, otherwise the
+# `unified` sub directory is used.
+#
+# This is a function, not a method!
+sub cgroupv2_base_path() {
+    $CGROUPV2_PATH //= cgroup_mode() == 2
+	? '/sys/fs/cgroup'
+	: '/sys/fs/cgroup/unified';
+
+    return $CGROUPV2_PATH;
+}
+
+# Find a cgroup controller and return its path and version.
+#
+# LXC initializes the unified hierarchy first, so if a controller is
+# available via both we favor cgroupv2 here as well. An undefined
+# `$controller` always selects the unified hierarchy.
+#
+# Returns the path in scalar context, and the path plus the cgroup version
+# (`1` or `2`) in list context.
+#
+# Returns nothing if the controller is not available.
+sub find_cgroup_controller($) {
+    my ($controller) = @_;
+
+    my ($v1_set, $v2_set) = get_cgroup_controllers();
+
+    my @found;
+    if (!defined($controller) || $v2_set->{$controller}) {
+	@found = (cgroupv2_base_path(), 2);
+    } elsif ($v1_set->{$controller}) {
+	@found = ("/sys/fs/cgroup/$controller", 1);
+    } else {
+	return;
+    }
+
+    return wantarray ? @found : $found[0];
+}
+
+# Cached result of the cpuset controller lookup (path and cgroup version).
+my $CG_PATH_CPUSET = undef;
+my $CG_VER_CPUSET = undef;
+# Find the cpuset cgroup controller.
+#
+# Returns the controller path in scalar context, and the path plus the cgroup
+# version in list context. Dies if there is no cpuset controller.
+#
+# This is a function, not a method!
+sub cpuset_controller_path() {
+    if (!defined($CG_PATH_CPUSET)) {
+	my @found = find_cgroup_controller('cpuset');
+	die "failed to find cpuset controller\n" if !@found;
+	($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
+    }
+
+    return $CG_PATH_CPUSET if !wantarray;
+    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
+}
+
+# Get a subdirectory (without the cgroup mount point) for a controller.
+#
+# If `$controller` is `undef`, get the unified (cgroupv2) path.
+#
+# `$limiting` selects between the 'limit' (true) and the 'ns' (false) path.
+# Both kinds are cached per controller in `$self->{controllers}`, so the lxc
+# command socket is only queried on a cache miss.
+#
+# Note that in cgroup v2, lxc uses the activated controller names
+# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
+# so this returns a result when a `controller` is provided even when using
+# a pure cgroupv2 setup.
+#
+# Returns `undef` if lxc did not provide a path, dies if the path contains
+# `..`.
+my sub get_subdir {
+    my ($self, $controller, $limiting) = @_;
+
+    my $cache = ($self->{controllers}->{$controller || 'unified'} //= {});
+    my $slot = $limiting ? 'limit' : 'ns';
+
+    return $cache->{$slot} if defined($cache->{$slot});
+
+    my $subdir = PVE::LXC::Command::get_cgroup_path($self->{vmid}, $controller, $limiting)
+	or return undef;
+
+    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;
+
+    # untaint:
+    ($subdir) = ($subdir =~ /^(.*)$/s);
+
+    $cache->{$slot} = $subdir;
+    return $subdir;
+}
+
+# Get the full cgroup path and the cgroup version for a controller.
+#
+# `$controller` may be `undef` to select the unified (cgroupv2) hierarchy,
+# `$limiting` is passed on to the sub directory lookup.
+#
+# Returns either just the path, or the path and cgroup version as a tuple.
+# Returns `undef` if the controller is not available or no sub directory
+# could be determined.
+sub get_path {
+    my ($self, $controller, $limiting) = @_;
+
+    # Find the controller before querying the lxc monitor via a socket:
+    my @found = find_cgroup_controller($controller);
+    return undef if !@found;
+    my ($base, $ver) = @found;
+
+    my $subdir = get_subdir($self, $controller, $limiting);
+    return undef if !$subdir;
+
+    my $full = "$base/$subdir";
+    return wantarray ? ($full, $ver) : $full;
+}
+
+# Convenience method to get the path info of the first existing controller.
+#
+# The `@controllers` are tried in the given order via `get_path`, and the
+# first one yielding a defined path wins.
+#
+# Returns the same as `get_path`, with undefined values if none matched.
+sub get_any_path {
+    my ($self, $limiting, @controllers) = @_;
+
+    for my $controller (@controllers) {
+	my ($path, $ver) = $self->get_path($controller, $limiting);
+	next if !defined($path);
+	return wantarray ? ($path, $ver) : $path;
+    }
+
+    return wantarray ? (undef, undef) : undef;
+}
+
+# Parse a 'Nested keyed' file:
+#
+# Every line consists of a key followed by whitespace separated `name=value`
+# pairs. Returns a hash reference mapping each line's key to a hash of its
+# pairs. Entries which are not of the form `name=value` produce a warning and
+# are skipped.
+#
+# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
+my sub parse_nested_keyed_file($) {
+    my ($data) = @_;
+
+    my %parsed;
+    for my $line (split(/\n/, $data)) {
+	my ($name, @pairs) = split(/\s+/, $line);
+
+	my $fields = ($parsed{$name} = {});
+
+	for my $pair (@pairs) {
+	    if ($pair =~ /^([^=]+)=(.*)$/) {
+		$fields->{$1} = $2;
+	    } else {
+		warn "bad key=value pair in nested keyed file\n";
+	    }
+	}
+    }
+
+    return \%parsed;
+}
+
+# Parse a 'Flat keyed' file:
+#
+# Every line consists of a key, whitespace and a value. Returns a hash
+# reference mapping keys to values. Lines not matching this form produce a
+# warning and are skipped.
+#
+# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
+my sub parse_flat_keyed_file($) {
+    my ($data) = @_;
+
+    my %parsed;
+    for my $line (split(/\n/, $data)) {
+	if ($line =~ /^(\S+)\s+(.*)$/) {
+	    $parsed{$1} = $2;
+	    next;
+	}
+	warn "bad 'key value' pair in flat keyed file\n";
+    }
+
+    return \%parsed;
+}
+
+# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
+#
+# Returns a hash reference with the keys `diskread` and `diskwrite` holding the
+# byte counters summed up over all devices, or `undef` if neither an 'io'
+# (cgroupv2) nor a 'blkio' (cgroupv1) cgroup path could be determined, which
+# normally means that the container is not running.
+#
+# Errors from reading the statistics file are not caught here. Dies on an
+# unexpected cgroup version.
+sub get_io_stats {
+    my ($self) = @_;
+
+    my $res = {
+	diskread => 0,
+	diskwrite => 0,
+    };
+
+    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
+    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');
+
+    # container not running
+    return undef if !defined($path);
+
+    if ($ver == 2) {
+	# cgroupv2 environment, io controller enabled
+	my $io_stat = file_get_contents("$path/io.stat");
+
+	# One entry per device, each with (among others) `rbytes` and `wbytes`.
+	my $data = parse_nested_keyed_file($io_stat);
+	for my $dev_stats (values %$data) {
+	    $res->{diskread} += $dev_stats->{rbytes} if $dev_stats->{rbytes};
+	    # Written bytes belong to 'diskwrite', not to 'diskread'.
+	    $res->{diskwrite} += $dev_stats->{wbytes} if $dev_stats->{wbytes};
+	}
+    } elsif ($ver == 1) {
+	# cgroupv1 environment:
+	my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
+	for my $line (split(/\n/, $io)) {
+	    # Only the per-device `<major>:<minor> Read|Write <bytes>` lines count.
+	    if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
+		$res->{diskread} += $bytes if $type eq 'Read';
+		$res->{diskwrite} += $bytes if $type eq 'Write';
+	    }
+	}
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    return $res;
+}
+
+# Read utime and stime for this container from the cpu(acct) cgroup.
+# Values are in milliseconds!
+#
+# Returns a hash reference with the keys `utime` and `stime`, or `undef` if no
+# 'cpuacct' or 'cpu' cgroup path could be determined (container not running)
+# or, with cgroupv2, if the `cpu.stat` file could not be read.
+#
+# Dies on an unexpected cgroup version.
+sub get_cpu_stat {
+    my ($self) = @_;
+
+    my $res = {
+	utime => 0,
+	stime => 0,
+    };
+
+    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');
+    if (!defined($path)) {
+	# container not running
+	return undef;
+    } elsif ($ver == 2) {
+	my $data = eval { file_get_contents("$path/cpu.stat") };
+
+	# or no cpu controller available:
+	return undef if !defined($data);
+
+	# The values are given in microseconds; missing keys count as 0.
+	$data = parse_flat_keyed_file($data);
+	$res->{utime} = int(($data->{user_usec} // 0) / 1000);
+	$res->{stime} = int(($data->{system_usec} // 0) / 1000);
+    } elsif ($ver == 1) {
+	# cgroupv1 environment, the values are given in clock ticks.
+	# Call the constant with explicit parentheses so that our own @_ is
+	# not passed along to it:
+	my $clock_ticks = POSIX::sysconf(POSIX::_SC_CLK_TCK());
+	my $clk_to_msec = 1000 / $clock_ticks;
+
+	my $data = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
+	$res->{utime} = int(($data->{user} // 0) * $clk_to_msec);
+	$res->{stime} = int(($data->{system} // 0) * $clk_to_msec);
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    return $res;
+}
+
+# Read the memory and swap usage of this container from the memory cgroup.
+#
+# Returns a hash reference with the keys `mem` and `swap`, or `undef` if no
+# memory cgroup path could be determined. Dies on an unexpected cgroup
+# version.
+sub get_memory_stat {
+    my ($self) = @_;
+
+    my ($path, $ver) = $self->get_path('memory', 1);
+
+    # container most likely isn't running
+    return undef if !defined($path);
+
+    my %usage = (
+	mem => 0,
+	swap => 0,
+    );
+
+    if ($ver == 2) {
+	my $current = file_get_contents("$path/memory.current");
+	my $swap_current = file_get_contents("$path/memory.swap.current");
+	chomp($current, $swap_current);
+
+	# FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
+	# the values in `memory.stat`...
+
+	@usage{qw(mem swap)} = ($current, $swap_current);
+    } elsif ($ver == 1) {
+	# cgroupv1 environment:
+	my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
+	my $usage_bytes = file_get_contents("$path/memory.usage_in_bytes");
+	my $memsw_bytes = file_get_contents("$path/memory.memsw.usage_in_bytes");
+	chomp($usage_bytes, $memsw_bytes);
+
+	# The plain usage includes the cache, and memsw is memory plus swap.
+	$usage{mem} = $usage_bytes - $stat->{total_cache};
+	$usage{swap} = $memsw_bytes - $usage_bytes;
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    return \%usage;
+}
+
+# Change the memory limit for this container.
+#
+# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave the respective
+# limit as it is.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_memory_limit {
+    my ($self, $mem_bytes, $swap_bytes) = @_;
+
+    my ($path, $ver) = $self->get_path('memory', 1);
+    die "trying to change memory cgroup values: container not running\n"
+	if !defined($path);
+
+    if ($ver == 2) {
+	# Both limits are independent files here.
+	PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes)
+	    if defined($swap_bytes);
+	PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes)
+	    if defined($mem_bytes);
+    } elsif ($ver == 1) {
+	# With cgroupv1 we cannot control memory and swap limits separately,
+	# there is a memory limit and a combined memory+swap limit. Since the
+	# two values aren't independent, the order of the writes depends on
+	# whether we grow or shrink.
+	my $file_mem = "$path/memory.limit_in_bytes";
+	my $file_memsw = "$path/memory.memsw.limit_in_bytes";
+
+	my $cur_mem = file_get_contents($file_mem);
+	my $cur_memsw = file_get_contents($file_memsw);
+	chomp($cur_mem, $cur_memsw);
+
+	$mem_bytes //= $cur_mem;
+	$swap_bytes //= $cur_memsw - $cur_mem;
+	my $memsw_bytes = $mem_bytes + $swap_bytes;
+
+	my @memsw_write = ($file_memsw, $memsw_bytes);
+	my @mem_write = ($file_mem, $mem_bytes);
+
+	# Growing means raising the combined limit first and then pulling the
+	# memory limit up. Shrinking means lowering the memory limit first, as
+	# the combined limit cannot be shrunk below it.
+	my @writes = $memsw_bytes > $cur_memsw
+	    ? (\@memsw_write, \@mem_write)
+	    : (\@mem_write, \@memsw_write);
+
+	PVE::ProcFSTools::write_proc_entry($_->@*) for @writes;
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Change the cpu quota for a container.
+#
+# `$quota` and `$period` are passed on to the kernel as-is. An undefined
+# `$quota` means "unlimited", an undefined `$period` selects the default
+# period. Passing a `$quota` without a `$period` is an error.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_cpu_quota {
+    my ($self, $quota, $period) = @_;
+
+    die "quota without period not allowed\n" if !defined($period) && defined($quota);
+
+    my ($path, $ver) = $self->get_path('cpu', 1);
+    if (!defined($path)) {
+	die "trying to change cpu quota cgroup values: container not running\n";
+    } elsif ($ver == 2) {
+	# cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
+	# in this interface:
+	$quota //= 'max'; # unlimited
+	if (defined($period)) {
+	    PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
+	} else {
+	    # Without a period there cannot be a quota either (see the check
+	    # above), and we're allowed to only write the quota:
+	    PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max');
+	}
+    } elsif ($ver == 1) {
+	$quota //= -1; # unlimited
+	# Unlike the quota, the period has no "-1" value, so fall back to the
+	# kernel's default period of 100ms instead:
+	$period //= 100_000;
+	PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
+	PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Change the cpu "shares" for a container.
+#
+# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
+#
+# In cgroupv2 we do not have "shares", we have "weights" in the range
+# of `[1..10000]` with a default of 100.
+#
+# Since the default values don't match when scaling linearly, we use the
+# values we get as-is and simply error for values >10000 in cgroupv2.
+#
+# It is left to the user to figure this out for now.
+#
+# If `$shares` is undefined, the default is used: 100 for cgroupv2, and
+# `$cgroupv1_default` (or 1024 if that is undefined as well) for cgroupv1.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_cpu_shares {
+    my ($self, $shares, $cgroupv1_default) = @_;
+
+    my ($path, $ver) = $self->get_path('cpu', 1);
+    if (!defined($path)) {
+	die "trying to change cpu shares/weight cgroup values: container not running\n";
+    } elsif ($ver == 2) {
+	# the cgroupv2 documentation defines the default to 100
+	$shares //= 100;
+	die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
+	PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
+    } elsif ($ver == 1) {
+	# Do not apply the cgroupv2 default of 100 here, the caller's default
+	# or the cgroupv1 default of 1024 has to be used instead.
+	$shares //= $cgroupv1_default // 1024;
+	PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
+    } else {
+	die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Freeze or thaw a container via the cgroupv1 freezer controller.
+#
+# `$controller_path` is the mount point of the freezer controller, `$freeze`
+# selects between freezing (true) and thawing (false).
+#
+# Writes the desired state to `freezer.state` and only returns once the file
+# reports that state. Dies if the container's freezer path cannot be
+# determined.
+my sub v1_freeze_thaw {
+    my ($self, $controller_path, $freeze) = @_;
+    my $path = get_subdir($self, 'freezer', 1)
+	or die "trying to freeze container: container not running\n";
+    $path = "$controller_path/$path/freezer.state";
+
+    my $data = $freeze ? 'FROZEN' : 'THAWED';
+    PVE::ProcFSTools::write_proc_entry($path, $data);
+
+    # Here we just poll the freezer.state once per second.
+    while (1) {
+	my $state = file_get_contents($path);
+	chomp $state;
+	last if $state eq $data;
+	# Wait between polls instead of spinning on the file.
+	sleep(1);
+    }
+}
+
+# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` file.
+#
+# `$controller_path` is the mount point of the unified hierarchy, `$freeze`
+# selects between freezing (true) and thawing (false).
+#
+# Writes the desired state to `cgroup.freeze` and then loops until the
+# `frozen` key in `cgroup.events` matches it. Dies if the container's cgroup
+# path cannot be determined, or if `cgroup.events` cannot be opened or
+# rewound.
+my sub v2_freeze_thaw {
+    my ($self, $controller_path, $freeze) = @_;
+    my $path = get_subdir($self, undef, 1)
+	or die "trying to freeze container: container not running\n";
+    $path = "$controller_path/$path";
+
+    # `cgroup.freeze` takes and `cgroup.events` reports the state as 1 or 0.
+    my $desired_state = $freeze ? 1 : 0;
+
+    # cgroupv2 supports poll events on cgroup.events which contains the frozen
+    # state.
+    # The file is opened before the state change is written below.
+    my $fh = IO::File->new("$path/cgroup.events", 'r')
+	or die "failed to open $path/cgroup.events file: $!\n";
+    my $select = IO::Select->new();
+    $select->add($fh);
+
+    PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state);
+    while (1) {
+	# Slurp everything from the current file position to the end.
+	my $data = do {
+	    local $/ = undef;
+	    <$fh>
+	};
+	$data = parse_flat_keyed_file($data);
+	last if $data->{frozen} == $desired_state;
+	# NOTE(review): presumably this waits until the kernel signals a change
+	# of the events file as an exceptional condition - confirm against the
+	# IO::Select documentation of `has_exception` without a timeout.
+	my @handles = $select->has_exception();
+	# NOTE(review): this path skips the rewind below, so the next read
+	# continues from the current position - verify this is intended.
+	next if !@handles;
+	# Rewind so that the next iteration reads the file from its start.
+	seek($fh, 0, 0)
+	    or die "failed to rewind cgroup.events file: $!\n";
+    }
+}
+
+# Freeze or unfreeze a container.
+#
+# This will freeze the container at its outer (limiting) cgroup path. We use
+# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
+# fetch the cgroup path from containers still running on lxc3.
+#
+# `$freeze` selects between freezing (true) and thawing (false).
+sub freeze_thaw {
+    my ($self, $freeze) = @_;
+
+    my $freezer_path = find_cgroup_controller('freezer');
+
+    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
+    # freezers, but we'll prefer v1 when it's available as that's what lxc
+    # does as well...
+    return v1_freeze_thaw($self, $freezer_path, $freeze)
+	if defined($freezer_path);
+
+    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
+}
+
+1;
-- 
2.20.1




  reply	other threads:[~2020-10-30  9:43 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-30  9:42 [pve-devel] [PATCH pve-common 0/4] add generic CGroup module Alexandre Derumier
2020-10-30  9:42 ` Alexandre Derumier [this message]
2020-10-30  9:42 ` [pve-devel] [PATCH pve-common 2/4] Cgroups: remove specific lxc code Alexandre Derumier
2020-11-05 10:35   ` Wolfgang Bumiller
2020-10-30  9:42 ` [pve-devel] [PATCH pve-common 3/4] bugfix: cpushares : default value is 1024 for cgroup v1 Alexandre Derumier
2020-10-30  9:42 ` [pve-devel] [PATCH pve-common 4/4] systemd: add CPUWeight encoding Alexandre Derumier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201030094217.53131-2-aderumier@odiso.com \
    --to=aderumier@odiso.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal