From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 278246458B for ; Fri, 30 Oct 2020 10:43:00 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id E3BDF11645 for ; Fri, 30 Oct 2020 10:42:29 +0100 (CET) Received: from kvmformation1.odiso.net (globalOdiso.M6Lille.odiso.net [89.248.211.242]) by firstgate.proxmox.com (Proxmox) with ESMTP id AE218115C2 for ; Fri, 30 Oct 2020 10:42:25 +0100 (CET) Received: by kvmformation1.odiso.net (Postfix, from userid 0) id 8C38DB5706F; Fri, 30 Oct 2020 10:42:18 +0100 (CET) From: Alexandre Derumier To: pve-devel@lists.proxmox.com Date: Fri, 30 Oct 2020 10:42:14 +0100 Message-Id: <20201030094217.53131-2-aderumier@odiso.com> X-Mailer: git-send-email 2.20.1 In-Reply-To: <20201030094217.53131-1-aderumier@odiso.com> References: <20201030094217.53131-1-aderumier@odiso.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 1 AWL -0.284 Adjusted score from AWL reputation of From: address HEADER_FROM_DIFFERENT_DOMAINS 0.249 From and EnvelopeFrom 2nd level mail domains are different KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment KAM_LAZY_DOMAIN_SECURITY 1 Sending domain does not have any anti-forgery methods KHOP_HELO_FCRDNS 0.276 Relay HELO differs from its IP's reverse DNS NO_DNS_FOR_FROM 0.379 Envelope sender has no MX or A DNS records SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_NONE 0.001 SPF: sender does not publish an SPF Record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. 
[clihandler.pm, inotify.pm, cgroup.pm, cgroup.events, exception.pm, calendarevent.pm, daemon.pm, cpuset.pm] Subject: [pve-devel] [PATCH pve-common 1/4] move PVE::LXC::CGroup to PVE::CGroup X-BeenThere: pve-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox VE development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 30 Oct 2020 09:43:00 -0000 --- src/Makefile | 1 + src/PVE/CGroup.pm | 582 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 583 insertions(+) create mode 100644 src/PVE/CGroup.pm diff --git a/src/Makefile b/src/Makefile index 1987d0e..b2a4ac6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -13,6 +13,7 @@ LIB_SOURCES = \ CLIHandler.pm \ CalendarEvent.pm \ CpuSet.pm \ + CGroup.pm \ Daemon.pm \ Exception.pm \ INotify.pm \ diff --git a/src/PVE/CGroup.pm b/src/PVE/CGroup.pm new file mode 100644 index 0000000..7e12af9 --- /dev/null +++ b/src/PVE/CGroup.pm @@ -0,0 +1,582 @@ +# cgroup handler +# +# This package should deal with figuring out the right cgroup path for a +# container (via the command socket), reading and writing cgroup values, and +# handling cgroup v1 & v2 differences. +# +# Note that the long term plan is to have resource management functions instead of +# dealing with cgroup files on the outside. + +package PVE::CGroup; + +use strict; +use warnings; + +use IO::File; +use IO::Select; +use POSIX qw(); + +use PVE::ProcFSTools; +use PVE::Tools qw( + file_get_contents + file_read_firstline +); + +use PVE::LXC::Command; + +# We don't want to do a command socket round trip for every cgroup read/write, +# so any cgroup function needs to have the container's path cached, so this +# package has to be instantiated. +# +# LXC keeps separate paths by controller (although they're normally all the +# same, in our case anyway), so we cache them by controller as well.
+sub new {
+    my ($class, $vmid) = @_;
+
+    # Only the container id is stored up front; the per-controller cgroup
+    # paths are cached in this object later, when they are first looked up.
+    my $self = { vmid => $vmid };
+
+    return bless $self, $class;
+}
+
+# Get the v1 controller list.
+#
+# Parses `/proc/self/cgroup`, whose lines have the form
+# `<hierarchy-id>:<controller-list>:<path>`. A non-empty controller list is
+# counted as a cgroupv1 hierarchy, an empty one as the unified hierarchy.
+#
+# Returns a set (hash mapping names to `1`) of cgroupv1 controllers, and, in
+# list context only, an additional boolean telling whether a unified
+# (cgroupv2) hierarchy exists.
+#
+# Deprecated: Use `get_cgroup_controllers()` instead.
+sub get_v1_controllers {
+    my $v1 = {};
+    my $v2 = 0;
+    my $data = PVE::Tools::file_get_contents('/proc/self/cgroup');
+    while ($data =~ /^\d+:([^:\n]*):.*$/gm) {
+        my $type = $1;
+        if (length($type)) {
+            # Co-mounted controllers are listed comma separated.
+            $v1->{$_} = 1 foreach split(/,/, $type);
+        } else {
+            $v2 = 1;
+        }
+    }
+    return wantarray ? ($v1, $v2) : $v1;
+}
+
+# Get the set of v2 controllers from the `cgroup.controllers` file.
+#
+# The pure cgroupv2 mount point is tried first, then the `unified` directory
+# used by hybrid layouts.
+#
+# Returns a set (hash mapping names to `1`), or `undef` if neither file could
+# be read.
+my sub get_v2_controllers {
+    # Chain with `//` rather than `||`: an existing but empty controllers file
+    # reads as '' (false), which must not be mistaken for a missing file.
+    my $v2 = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') }
+        // eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') };
+    return undef if !defined $v2;
+
+    # It's a simple space separated list:
+    return { map { $_ => 1 } split(/\s+/, $v2) };
+}
+
+my $CGROUP_CONTROLLERS = undef;
+# Get a list of controllers enabled in each cgroup subsystem.
+#
+# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
+#
+# The result is computed once and then served from `$CGROUP_CONTROLLERS`.
+#
+# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
+# version. The second one is `undef` if no cgroupv2 controllers file could be
+# read.
+sub get_cgroup_controllers() {
+    if (!defined($CGROUP_CONTROLLERS)) {
+        my ($v1, undef) = get_v1_controllers();
+        my $v2 = get_v2_controllers();
+
+        $CGROUP_CONTROLLERS = [$v1, $v2];
+    }
+
+    return $CGROUP_CONTROLLERS->@*;
+}
+
+my $CGROUP_MODE = undef;
+# Figure out which cgroup mode we're operating under:
+#
+# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
+# cgroupv2-only environment.
+#
+# NOTE: To fully support a hybrid layout it is better to use functions like
+# `cpuset_controller_path`.
+#
+# This is a function, not a method!
+sub cgroup_mode() {
+    # Detect once; later calls are answered from the cached value.
+    if (!defined($CGROUP_MODE)) {
+        my ($legacy, $unified) = get_cgroup_controllers();
+        my $has_legacy = scalar(keys %$legacy) ? 1 : 0;
+
+        if ($has_legacy) {
+            # hybrid or legacy mode
+            $CGROUP_MODE = 1;
+        } elsif ($unified) {
+            $CGROUP_MODE = 2;
+        }
+    }
+
+    return $CGROUP_MODE if defined($CGROUP_MODE);
+
+    die "unknown cgroup mode\n";
+}
+
+my $CGROUPV2_PATH = undef;
+# Get the mount point of the unified (cgroupv2) hierarchy.
+#
+# This is the cgroup root itself in a cgroupv2-only environment and the
+# `unified` sub directory otherwise. The value is computed once and cached.
+sub cgroupv2_base_path() {
+    $CGROUPV2_PATH //= (cgroup_mode() == 2)
+        ? '/sys/fs/cgroup'
+        : '/sys/fs/cgroup/unified';
+
+    return $CGROUPV2_PATH;
+}
+
+# Look up a cgroup controller, yielding its mount path and cgroup version.
+#
+# LXC initializes the unified hierarchy first, so a controller that is
+# available via both versions is reported as cgroupv2 here as well. Passing
+# `undef` as controller selects the unified hierarchy.
+#
+# In list context `(path, version)` is returned, in scalar context only the
+# path. Returns nothing if the controller is not available.
+sub find_cgroup_controller($) {
+    my ($controller) = @_;
+
+    my ($legacy, $unified) = get_cgroup_controllers();
+
+    my ($mount, $version);
+    if (!defined($controller) || $unified->{$controller}) {
+        $mount = cgroupv2_base_path();
+        $version = 2;
+    } elsif ($legacy->{$controller}) {
+        $mount = "/sys/fs/cgroup/$controller";
+        $version = 1;
+    } else {
+        return;
+    }
+
+    return $mount if !wantarray;
+    return ($mount, $version);
+}
+
+my $CG_PATH_CPUSET = undef;
+my $CG_VER_CPUSET = undef;
+# Locate the cpuset cgroup controller, caching the result.
+#
+# Dies if no cpuset controller can be found.
+#
+# This is a function, not a method!
+sub cpuset_controller_path() {
+    if (!defined($CG_PATH_CPUSET)) {
+        my @found = find_cgroup_controller('cpuset');
+        # The assignment comes first so that a failed lookup leaves both
+        # cached values undefined before we bail out.
+        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
+        die "failed to find cpuset controller\n" if !@found;
+    }
+
+    return $CG_PATH_CPUSET if !wantarray;
+    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
+}
+
+# Get a subdirectory (without the cgroup mount point) for a controller.
+#
+# If `$controller` is `undef`, get the unified (cgroupv2) path.
+#
+# Note that in cgroup v2, lxc uses the activated controller names
+# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
+# so this returns a result when a `controller` is provided even when using
+# a pure cgroupv2 setup.
+my sub get_subdir {
+    my ($self, $controller, $limiting) = @_;
+
+    # Results are cached per controller and per kind of path ('limit' for the
+    # limiting cgroup, 'ns' otherwise), so the lxc command socket is only
+    # queried on a cache miss.
+    my $cache_name = $controller || 'unified';
+    my $slot = $limiting ? 'limit' : 'ns';
+    my $cache = ($self->{controllers}->{$cache_name} //= {});
+
+    my $cached = $cache->{$slot};
+    return $cached if defined $cached;
+
+    my $subdir = PVE::LXC::Command::get_cgroup_path($self->{vmid}, $controller, $limiting);
+    return undef if !$subdir;
+
+    # untaint, but refuse anything that tries to walk up the tree:
+    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;
+    ($subdir) = ($subdir =~ /^(.*)$/s);
+
+    $cache->{$slot} = $subdir;
+
+    return $subdir;
+}
+
+# Build the full cgroup path for a controller and report its cgroup version.
+#
+# `$controller` may be `undef`, in which case the unified (cgroupv2) path is
+# used.
+#
+# Returns the path in scalar context and `(path, version)` in list context, or
+# `undef` if the controller is not available or no subdirectory was returned
+# for the container.
+sub get_path {
+    my ($self, $controller, $limiting) = @_;
+
+    # Find the controller before querying the lxc monitor via a socket:
+    my @location = find_cgroup_controller($controller);
+    return undef if !@location;
+    my ($mount, $version) = @location;
+
+    my $subdir = get_subdir($self, $controller, $limiting);
+    return undef if !$subdir;
+
+    my $full_path = "$mount/$subdir";
+    return $full_path if !wantarray;
+    return ($full_path, $version);
+}
+
+# Convenience method to get the path info of the first existing controller
+# out of `@controllers`, tried in the given order.
+#
+# Returns the same as `get_path`.
+sub get_any_path {
+    my ($self, $limiting, @controllers) = @_;
+
+    my ($found_path, $found_ver);
+    foreach my $name (@controllers) {
+        ($found_path, $found_ver) = $self->get_path($name, $limiting);
+        last if defined $found_path;
+    }
+
+    return $found_path if !wantarray;
+    return ($found_path, $found_ver);
+}
+
+# Parse a 'Nested keyed' file:
+#
+# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
+my sub parse_nested_keyed_file($) {
+    my ($data) = @_;
+    my $res = {};
+    foreach my $line (split(/\n/, $data)) {
+        # Skip blank lines instead of recording an undefined key for them.
+        next if $line !~ /\S/;
+
+        # Each line is `<key> <name>=<value> <name>=<value> ...`
+        my ($key, @pairs) = split(/\s+/, $line);
+
+        my $entry = ($res->{$key} = {});
+
+        foreach my $pair (@pairs) {
+            if (my ($name, $value) = ($pair =~ /^([^=]+)=(.*)$/)) {
+                $entry->{$name} = $value;
+            } else {
+                warn "bad key=value pair in nested keyed file\n";
+            }
+        }
+    }
+    return $res;
+}
+
+# Parse a 'Flat keyed' file:
+#
+# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
+#
+# Returns a hash mapping each key to the remainder of its line. Lines which
+# do not consist of a key followed by whitespace and a value produce a
+# warning and are skipped.
+my sub parse_flat_keyed_file($) {
+    my ($data) = @_;
+    my $res = {};
+    foreach my $line (split(/\n/, $data)) {
+        if (my ($key, $value) = ($line =~ /^(\S+)\s+(.*)$/)) {
+            $res->{$key} = $value;
+        } else {
+            warn "bad 'key value' pair in flat keyed file\n";
+        }
+    }
+    return $res;
+}
+
+# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
+#
+# Returns a hash with the byte counters `diskread` and `diskwrite` summed up
+# over all listed devices, or `undef` if no I/O controller path could be
+# determined (container not running).
+sub get_io_stats {
+    my ($self) = @_;
+
+    my $res = {
+        diskread => 0,
+        diskwrite => 0,
+    };
+
+    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
+    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');
+    if (!defined($path)) {
+        # container not running
+        return undef;
+    } elsif ($ver == 2) {
+        # cgroupv2 environment, io controller enabled
+        my $io_stat = file_get_contents("$path/io.stat");
+
+        my $data = parse_nested_keyed_file($io_stat);
+        foreach my $dev (keys %$data) {
+            my $stats = $data->{$dev};
+            if (my $bytes = $stats->{rbytes}) {
+                $res->{diskread} += $bytes;
+            }
+            if (my $bytes = $stats->{wbytes}) {
+                # Written bytes belong to 'diskwrite', not 'diskread'.
+                $res->{diskwrite} += $bytes;
+            }
+        }
+
+        return $res;
+    } elsif ($ver == 1) {
+        # cgroupv1 environment:
+        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
+        foreach my $line (split(/\n/, $io)) {
+            # Only the per-device 'Read' and 'Write' lines are of interest.
+            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
+                $res->{diskread} += $bytes if $type eq 'Read';
+                $res->{diskwrite} += $bytes if $type eq 'Write';
+            }
+        }
+
+        return $res;
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+}
+
+# Read utime and stime for this container from the cpuacct cgroup.
+# Values are in milliseconds!
+#
+# Returns a hash with `utime` and `stime`, or `undef` if the container is not
+# running or (cgroupv2) `cpu.stat` could not be read.
+sub get_cpu_stat {
+    my ($self) = @_;
+
+    my $res = {
+        utime => 0,
+        stime => 0,
+    };
+
+    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');
+    if (!defined($path)) {
+        # container not running
+        return undef;
+    } elsif ($ver == 2) {
+        my $data = eval { file_get_contents("$path/cpu.stat") };
+
+        # or no cpu controller available:
+        return undef if !defined($data);
+
+        $data = parse_flat_keyed_file($data);
+        # cpu.stat reports microseconds, we return milliseconds.
+        $res->{utime} = int($data->{user_usec} / 1000);
+        $res->{stime} = int($data->{system_usec} / 1000);
+    } elsif ($ver == 1) {
+        # cgroupv1 environment:
+        # cpuacct.stat counts clock ticks; convert them to milliseconds.
+        my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK);
+        my $clk_to_msec = 1000 / $clock_ticks;
+
+        my $data = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
+        $res->{utime} = int($data->{user} * $clk_to_msec);
+        $res->{stime} = int($data->{system} * $clk_to_msec);
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+
+    return $res;
+}
+
+# Read the current memory and swap usage of this container.
+#
+# Returns a hash with `mem` and `swap` (both in bytes), or `undef` if the
+# memory cgroup path could not be determined.
+sub get_memory_stat {
+    my ($self) = @_;
+
+    my $res = {
+        mem => 0,
+        swap => 0,
+    };
+
+    my ($path, $ver) = $self->get_path('memory', 1);
+    if (!defined($path)) {
+        # container most likely isn't running
+        return undef;
+    } elsif ($ver == 2) {
+        my $mem = file_get_contents("$path/memory.current");
+        my $swap = file_get_contents("$path/memory.swap.current");
+
+        chomp ($mem, $swap);
+
+        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
+        # the values in `memory.stat`...
+
+        $res->{mem} = $mem;
+        $res->{swap} = $swap;
+    } elsif ($ver == 1) {
+        # cgroupv1 environment:
+        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
+        my $mem = file_get_contents("$path/memory.usage_in_bytes");
+        my $memsw = file_get_contents("$path/memory.memsw.usage_in_bytes");
+        chomp ($mem, $memsw);
+
+        # The page cache is not counted as used memory, and the memsw counter
+        # includes memory, so swap is the difference of the two.
+        $res->{mem} = $mem - $stat->{total_cache};
+        $res->{swap} = $memsw - $mem;
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+
+    return $res;
+}
+
+# Change the memory limit for this container.
+#
+# `$mem_bytes` and `$swap_bytes` may each be `undef` to keep the current value.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_memory_limit {
+    my ($self, $mem_bytes, $swap_bytes) = @_;
+
+    my ($path, $ver) = $self->get_path('memory', 1);
+    if (!defined($path)) {
+        die "trying to change memory cgroup values: container not running\n";
+    } elsif ($ver == 2) {
+        PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes)
+            if defined($swap_bytes);
+        PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes)
+            if defined($mem_bytes);
+    } elsif ($ver == 1) {
+        # With cgroupv1 we cannot control memory and swap limits separately.
+        # This also means that since the two values aren't independent, we need to handle
+        # growing and shrinking separately.
+        my $path_mem = "$path/memory.limit_in_bytes";
+        my $path_memsw = "$path/memory.memsw.limit_in_bytes";
+
+        my $old_mem_bytes = file_get_contents($path_mem);
+        my $old_memsw_bytes = file_get_contents($path_memsw);
+        chomp($old_mem_bytes, $old_memsw_bytes);
+
+        $mem_bytes //= $old_mem_bytes;
+        $swap_bytes //= $old_memsw_bytes - $old_mem_bytes;
+        my $memsw_bytes = $mem_bytes + $swap_bytes;
+
+        if ($memsw_bytes > $old_memsw_bytes) {
+            # Growing the limit means growing the combined limit first, then pulling the
+            # memory limit up.
+            PVE::ProcFSTools::write_proc_entry($path_memsw, $memsw_bytes);
+            PVE::ProcFSTools::write_proc_entry($path_mem, $mem_bytes);
+        } else {
+            # Shrinking means we first need to shrink the memory-only limit, as the
+            # combined (memsw) limit cannot be shrunk below it.
+            PVE::ProcFSTools::write_proc_entry($path_mem, $mem_bytes);
+            PVE::ProcFSTools::write_proc_entry($path_memsw, $memsw_bytes);
+        }
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Change the cpu quota for a container.
+#
+# `$quota` and `$period` are passed on to the cgroup files as given. An
+# undefined `$quota` means "unlimited"; an undefined `$period` resets the
+# period as well and is only allowed together with an undefined `$quota`.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_cpu_quota {
+    my ($self, $quota, $period) = @_;
+
+    die "quota without period not allowed\n" if !defined($period) && defined($quota);
+
+    my ($path, $ver) = $self->get_path('cpu', 1);
+    if (!defined($path)) {
+        die "trying to change cpu quota cgroup values: container not running\n";
+    } elsif ($ver == 2) {
+        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
+        # in this interface:
+        $quota //= 'max'; # unlimited
+        if (defined($period)) {
+            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
+        } else {
+            # we're allowed to only write the quota; without a period the quota
+            # can only be the unlimited one here (see the check above):
+            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max');
+        }
+    } elsif ($ver == 1) {
+        $quota //= -1; # unlimited
+        # -1 is not a valid period length, reset to the kernel default (100 ms):
+        $period //= 100_000;
+        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
+        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Change the cpu "shares" for a container.
+#
+# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
+#
+# In cgroupv2 we do not have "shares", we have "weights" in the range
+# of `[1..10000]` with a default of 100.
+#
+# Since the default values don't match when scaling linearly, we use the
+# values we get as-is and simply error for values >10000 in cgroupv2.
+#
+# It is left to the user to figure this out for now.
+#
+# An undefined `$shares` selects the default: 100 for cgroupv2, and for
+# cgroupv1 the caller supplied `$cgroupv1_default`, falling back to 1024.
+#
+# Dies on error (including a not-running or currently-shutting-down guest).
+sub change_cpu_shares {
+    my ($self, $shares, $cgroupv1_default) = @_;
+
+    my ($path, $ver) = $self->get_path('cpu', 1);
+    if (!defined($path)) {
+        die "trying to change cpu shares/weight cgroup values: container not running\n";
+    } elsif ($ver == 2) {
+        # the cgroupv2 documentation defines the default to 100
+        $shares //= 100;
+        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
+        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
+    } elsif ($ver == 1) {
+        # Do not apply the cgroupv2 default of 100 here, otherwise the
+        # caller's cgroupv1 default could never take effect.
+        $shares //= $cgroupv1_default // 1024;
+        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
+    } else {
+        die "bad cgroup version: $ver\n";
+    }
+
+    # return a truth value
+    return 1;
+}
+
+# Freeze or thaw a container via the cgroupv1 freezer controller mounted at
+# `$controller_path`, and wait until the requested state is reported.
+#
+# Dies if no freezer cgroup path is available for the container.
+my sub v1_freeze_thaw {
+    my ($self, $controller_path, $freeze) = @_;
+    my $path = get_subdir($self, 'freezer', 1)
+        or die "trying to freeze container: container not running\n";
+    $path = "$controller_path/$path/freezer.state";
+
+    my $data = $freeze ? 'FROZEN' : 'THAWED';
+    PVE::ProcFSTools::write_proc_entry($path, $data);
+
+    # Here we just poll freezer.state until it reports the requested state.
+    # NOTE(review): there is no delay between reads, so this is a busy loop.
+    while (1) {
+        my $state = file_get_contents($path);
+        chomp $state;
+        last if $state eq $data;
+    }
+}
+
+# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` file below
+# `$controller_path`, and wait until `cgroup.events` reports the requested
+# `frozen` state.
+#
+# Dies if no unified cgroup path is available for the container, or if
+# `cgroup.events` cannot be opened or rewound.
+my sub v2_freeze_thaw {
+    my ($self, $controller_path, $freeze) = @_;
+    my $path = get_subdir($self, undef, 1)
+        or die "trying to freeze container: container not running\n";
+    $path = "$controller_path/$path";
+
+    my $desired_state = $freeze ? 1 : 0;
+
+    # cgroupv2 supports poll events on cgroup.events which contains the frozen
+    # state.
+    # The file is opened before the state change is requested, so a change
+    # happening right after the write is not missed.
+    my $fh = IO::File->new("$path/cgroup.events", 'r')
+        or die "failed to open $path/cgroup.events file: $!\n";
+    my $select = IO::Select->new();
+    $select->add($fh);
+
+    PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state);
+    while (1) {
+        # Slurp the whole events file in one go.
+        my $data = do {
+            local $/ = undef;
+            <$fh>
+        };
+        $data = parse_flat_keyed_file($data);
+        last if $data->{frozen} == $desired_state;
+        # Wait for the kernel to signal a change on the events file, then
+        # rewind to read its new contents.
+        my @handles = $select->has_exception();
+        next if !@handles;
+        seek($fh, 0, 0)
+            or die "failed to rewind cgroup.events file: $!\n";
+    }
+}
+
+# Freeze or unfreeze a container.
+#
+# This will freeze the container at its outer (limiting) cgroup path. We use
+# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
+# fetch the cgroup path from containers still running on lxc3.
+#
+# A true `$freeze` freezes the container, a false one thaws it.
+sub freeze_thaw {
+    my ($self, $freeze) = @_;
+
+    my $controller_path = find_cgroup_controller('freezer');
+    if (defined($controller_path)) {
+        return v1_freeze_thaw($self, $controller_path, $freeze);
+    } else {
+        # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
+        # freezers, but we'll prefer v1 when it's available as that's what lxc
+        # does as well...
+        return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
+    }
+}
+
+1;
-- 
2.20.1