diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index d6e18ba2e..dacfb0fb6 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -211,3 +211,20 @@ jobs: - name: Generate documentation run: tox -m docs + + isort: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: 3.12 + cache: pip + + - name: isort + uses: isort/isort-action@master + with: + requirementsFiles: "requirements.txt requirements.dev.txt requirements.docs.txt" + sort-paths: "patroni tests features setup.py" diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst index d6c40116f..ce2a5df8a 100644 --- a/docs/ENVIRONMENT.rst +++ b/docs/ENVIRONMENT.rst @@ -28,6 +28,7 @@ Log - **PATRONI\_LOG\_STATIC\_FIELDS**: add additional fields to the log. This option is only available when the log type is set to **json**. Example ``PATRONI_LOG_STATIC_FIELDS="{app: patroni}"`` - **PATRONI\_LOG\_MAX\_QUEUE\_SIZE**: Patroni is using two-step logging. Log records are written into the in-memory queue and there is a separate thread which pulls them from the queue and writes to stderr or file. The maximum size of the internal queue is limited by default by **1000** records, which is enough to keep logs for the past 1h20m. - **PATRONI\_LOG\_DIR**: Directory to write application logs to. The directory must exist and be writable by the user executing Patroni. If you set this env variable, the application will retain 4 25MB logs by default. You can tune those retention values with `PATRONI_LOG_FILE_NUM` and `PATRONI_LOG_FILE_SIZE` (see below). +- **PATRONI\_LOG\_MODE**: Permissions for log files (for example, ``0644``). If not specified, permissions will be set based on the current umask value. - **PATRONI\_LOG\_FILE\_NUM**: The number of application logs to retain. - **PATRONI\_LOG\_FILE\_SIZE**: Size of patroni.log file (in bytes) that triggers a log rolling. - **PATRONI\_LOG\_LOGGERS**: Redefine logging level per python module. Example ``PATRONI_LOG_LOGGERS="{patroni.postmaster: WARNING, urllib3: DEBUG}"`` diff --git a/docs/citus.rst b/docs/citus.rst index f404f47e9..24ad6ffb2 100644 --- a/docs/citus.rst +++ b/docs/citus.rst @@ -34,6 +34,8 @@ There are only a few simple rules you need to follow: After that you just need to start Patroni and it will handle the rest: +0. Patroni will set ``bootstrap.dcs.synchronous_mode`` to :ref:`quorum ` + if it is not explicitly set to any other value. 1. ``citus`` extension will be automatically added to ``shared_preload_libraries``. 2. 
If ``max_prepared_transactions`` isn't explicitly set in the global :ref:`dynamic configuration ` Patroni will @@ -77,36 +79,36 @@ It results in two major differences in :ref:`patronictl` behaviour when An example of :ref:`patronictl_list` output for the Citus cluster:: postgres@coord1:~$ patronictl list demo - + Citus cluster: demo ----------+--------------+---------+----+-----------+ - | Group | Member | Host | Role | State | TL | Lag in MB | - +-------+---------+-------------+--------------+---------+----+-----------+ - | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | - | 0 | coord2 | 172.27.0.6 | Sync Standby | running | 1 | 0 | - | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | - | 1 | work1-1 | 172.27.0.8 | Sync Standby | running | 1 | 0 | - | 1 | work1-2 | 172.27.0.2 | Leader | running | 1 | | - | 2 | work2-1 | 172.27.0.5 | Sync Standby | running | 1 | 0 | - | 2 | work2-2 | 172.27.0.7 | Leader | running | 1 | | - +-------+---------+-------------+--------------+---------+----+-----------+ + + Citus cluster: demo ----------+----------------+---------+----+-----------+ + | Group | Member | Host | Role | State | TL | Lag in MB | + +-------+---------+-------------+----------------+---------+----+-----------+ + | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | + | 0 | coord2 | 172.27.0.6 | Quorum Standby | running | 1 | 0 | + | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | + | 1 | work1-1 | 172.27.0.8 | Quorum Standby | running | 1 | 0 | + | 1 | work1-2 | 172.27.0.2 | Leader | running | 1 | | + | 2 | work2-1 | 172.27.0.5 | Quorum Standby | running | 1 | 0 | + | 2 | work2-2 | 172.27.0.7 | Leader | running | 1 | | + +-------+---------+-------------+----------------+---------+----+-----------+ If we add the ``--group`` option, the output will change to:: postgres@coord1:~$ patronictl list demo --group 0 - + Citus cluster: demo (group: 0, 7179854923829112860) -----------+ - | Member | Host | Role | State | TL | Lag in MB | - +--------+-------------+--------------+---------+----+-----------+ - | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | - | coord2 | 172.27.0.6 | Sync Standby | running | 1 | 0 | - | coord3 | 172.27.0.4 | Leader | running | 1 | | - +--------+-------------+--------------+---------+----+-----------+ + + Citus cluster: demo (group: 0, 7179854923829112860) -+-----------+ + | Member | Host | Role | State | TL | Lag in MB | + +--------+-------------+----------------+---------+----+-----------+ + | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | + | coord2 | 172.27.0.6 | Quorum Standby | running | 1 | 0 | + | coord3 | 172.27.0.4 | Leader | running | 1 | | + +--------+-------------+----------------+---------+----+-----------+ postgres@coord1:~$ patronictl list demo --group 1 - + Citus cluster: demo (group: 1, 7179854923881963547) -----------+ - | Member | Host | Role | State | TL | Lag in MB | - +---------+------------+--------------+---------+----+-----------+ - | work1-1 | 172.27.0.8 | Sync Standby | running | 1 | 0 | - | work1-2 | 172.27.0.2 | Leader | running | 1 | | - +---------+------------+--------------+---------+----+-----------+ + + Citus cluster: demo (group: 1, 7179854923881963547) -+-----------+ + | Member | Host | Role | State | TL | Lag in MB | + +---------+------------+----------------+---------+----+-----------+ + | work1-1 | 172.27.0.8 | Quorum Standby | running | 1 | 0 | + | work1-2 | 172.27.0.2 | Leader | running | 1 | | + +---------+------------+----------------+---------+----+-----------+ Citus worker switchover 
----------------------- @@ -122,28 +124,28 @@ new primary worker node is ready to accept read-write queries. An example of :ref:`patronictl_switchover` on the worker cluster:: postgres@coord1:~$ patronictl switchover demo - + Citus cluster: demo ----------+--------------+---------+----+-----------+ - | Group | Member | Host | Role | State | TL | Lag in MB | - +-------+---------+-------------+--------------+---------+----+-----------+ - | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | - | 0 | coord2 | 172.27.0.6 | Sync Standby | running | 1 | 0 | - | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | - | 1 | work1-1 | 172.27.0.8 | Leader | running | 1 | | - | 1 | work1-2 | 172.27.0.2 | Sync Standby | running | 1 | 0 | - | 2 | work2-1 | 172.27.0.5 | Sync Standby | running | 1 | 0 | - | 2 | work2-2 | 172.27.0.7 | Leader | running | 1 | | - +-------+---------+-------------+--------------+---------+----+-----------+ + + Citus cluster: demo ----------+----------------+---------+----+-----------+ + | Group | Member | Host | Role | State | TL | Lag in MB | + +-------+---------+-------------+----------------+---------+----+-----------+ + | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | + | 0 | coord2 | 172.27.0.6 | Quorum Standby | running | 1 | 0 | + | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | + | 1 | work1-1 | 172.27.0.8 | Leader | running | 1 | | + | 1 | work1-2 | 172.27.0.2 | Quorum Standby | running | 1 | 0 | + | 2 | work2-1 | 172.27.0.5 | Quorum Standby | running | 1 | 0 | + | 2 | work2-2 | 172.27.0.7 | Leader | running | 1 | | + +-------+---------+-------------+----------------+---------+----+-----------+ Citus group: 2 Primary [work2-2]: Candidate ['work2-1'] []: When should the switchover take place (e.g. 2022-12-22T08:02 ) [now]: Current cluster topology - + Citus cluster: demo (group: 2, 7179854924063375386) -----------+ - | Member | Host | Role | State | TL | Lag in MB | - +---------+------------+--------------+---------+----+-----------+ - | work2-1 | 172.27.0.5 | Sync Standby | running | 1 | 0 | - | work2-2 | 172.27.0.7 | Leader | running | 1 | | - +---------+------------+--------------+---------+----+-----------+ + + Citus cluster: demo (group: 2, 7179854924063375386) -+-----------+ + | Member | Host | Role | State | TL | Lag in MB | + +---------+------------+----------------+---------+----+-----------+ + | work2-1 | 172.27.0.5 | Quorum Standby | running | 1 | 0 | + | work2-2 | 172.27.0.7 | Leader | running | 1 | | + +---------+------------+----------------+---------+----+-----------+ Are you sure you want to switchover cluster demo, demoting current primary work2-2? 
[y/N]: y 2022-12-22 07:02:40.33003 Successfully switched over to "work2-1" + Citus cluster: demo (group: 2, 7179854924063375386) ------+ @@ -154,17 +156,17 @@ An example of :ref:`patronictl_switchover` on the worker cluster:: +---------+------------+---------+---------+----+-----------+ postgres@coord1:~$ patronictl list demo - + Citus cluster: demo ----------+--------------+---------+----+-----------+ - | Group | Member | Host | Role | State | TL | Lag in MB | - +-------+---------+-------------+--------------+---------+----+-----------+ - | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | - | 0 | coord2 | 172.27.0.6 | Sync Standby | running | 1 | 0 | - | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | - | 1 | work1-1 | 172.27.0.8 | Leader | running | 1 | | - | 1 | work1-2 | 172.27.0.2 | Sync Standby | running | 1 | 0 | - | 2 | work2-1 | 172.27.0.5 | Leader | running | 2 | | - | 2 | work2-2 | 172.27.0.7 | Sync Standby | running | 2 | 0 | - +-------+---------+-------------+--------------+---------+----+-----------+ + + Citus cluster: demo ----------+----------------+---------+----+-----------+ + | Group | Member | Host | Role | State | TL | Lag in MB | + +-------+---------+-------------+----------------+---------+----+-----------+ + | 0 | coord1 | 172.27.0.10 | Replica | running | 1 | 0 | + | 0 | coord2 | 172.27.0.6 | Quorum Standby | running | 1 | 0 | + | 0 | coord3 | 172.27.0.4 | Leader | running | 1 | | + | 1 | work1-1 | 172.27.0.8 | Leader | running | 1 | | + | 1 | work1-2 | 172.27.0.2 | Quorum Standby | running | 1 | 0 | + | 2 | work2-1 | 172.27.0.5 | Leader | running | 2 | | + | 2 | work2-2 | 172.27.0.7 | Quorum Standby | running | 2 | 0 | + +-------+---------+-------------+----------------+---------+----+-----------+ And this is how it looks on the coordinator side:: diff --git a/docs/dynamic_configuration.rst b/docs/dynamic_configuration.rst index 44cf4c2b9..1990c9404 100644 --- a/docs/dynamic_configuration.rst +++ b/docs/dynamic_configuration.rst @@ -25,8 +25,9 @@ In order to change the dynamic configuration you can use either :ref:`patronictl - **max\_timelines\_history**: maximum number of timeline history items kept in DCS. Default value: 0. When set to 0, it keeps the full history in DCS. - **primary\_start\_timeout**: the amount of time a primary is allowed to recover from failures before failover is triggered (in seconds). Default is 300 seconds. When set to 0 failover is done immediately after a crash is detected if possible. When using asynchronous replication a failover can cause lost transactions. Worst case failover time for primary failure is: loop\_wait + primary\_start\_timeout + loop\_wait, unless primary\_start\_timeout is zero, in which case it's just loop\_wait. Set the value according to your durability/availability tradeoff. - **primary\_stop\_timeout**: The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled. When set to > 0 and the synchronous_mode is enabled, Patroni sends SIGKILL to the postmaster if the stop operation is running for more than the value set by primary\_stop\_timeout. Set the value according to your durability/availability tradeoff. If the parameter is not set or set <= 0, primary\_stop\_timeout does not apply. -- **synchronous\_mode**: turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election. 
Synchronous mode makes sure that successfully committed transactions will not be lost at failover, at the cost of losing availability for writes when Patroni cannot ensure transaction durability. See :ref:`replication modes documentation ` for details. +- **synchronous\_mode**: turns on synchronous replication mode. Possible values: ``off``, ``on``, ``quorum``. In this mode the leader manages ``synchronous_standby_names``, and only the last known leader or one of the synchronous replicas is allowed to participate in the leader race. Synchronous mode makes sure that successfully committed transactions will not be lost at failover, at the cost of losing availability for writes when Patroni cannot ensure transaction durability. See :ref:`replication modes documentation ` for details. - **synchronous\_mode\_strict**: prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the primary. See :ref:`replication modes documentation ` for details. +- **synchronous\_node\_count**: if ``synchronous_mode`` is enabled, this parameter is used by Patroni to manage the precise number of synchronous standby instances; it adjusts the state in DCS and the ``synchronous_standby_names`` parameter in PostgreSQL as members join and leave. If the parameter is set to a value higher than the number of eligible nodes, it will be automatically adjusted. Defaults to ``1``. - **failsafe\_mode**: Enables :ref:`DCS Failsafe Mode `. Defaults to `false`. - **postgresql**: diff --git a/docs/replication_modes.rst b/docs/replication_modes.rst index 017abf15e..27a75ce74 100644 --- a/docs/replication_modes.rst +++ b/docs/replication_modes.rst @@ -6,8 +6,9 @@ Replication modes Patroni uses PostgreSQL streaming replication. For more information about streaming replication, see the `Postgres documentation `__. By default Patroni configures PostgreSQL for asynchronous replication. Choosing your replication schema is dependent on your business considerations. Investigate both async and sync replication, as well as other HA solutions, to determine which solution is best for you. + Asynchronous mode durability ----------------------------- +============================ In asynchronous mode the cluster is allowed to lose some committed transactions to ensure availability. When the primary server fails or becomes unavailable for any other reason Patroni will automatically promote a sufficiently healthy standby to primary. Any transactions that have not been replicated to that standby remain in a "forked timeline" on the primary, and are effectively unrecoverable [1]_. @@ -15,10 +16,11 @@ The amount of transactions that can be lost is controlled via ``maximum_lag_on_f By default, when running leader elections, Patroni does not take into account the current timeline of replicas, what in some cases could be undesirable behavior. You can prevent the node not having the same timeline as a former primary become the new leader by changing the value of ``check_timeline`` parameter to ``true``. + PostgreSQL synchronous replication ----------------------------------- +================================== -You can use Postgres's `synchronous replication `__ with Patroni. Synchronous replication ensures consistency across a cluster by confirming that writes are written to a secondary before returning to the connecting client with a success. The cost of synchronous replication: reduced throughput on writes. This throughput will be entirely based on network performance.
+You can use Postgres's `synchronous replication `__ with Patroni. Synchronous replication ensures consistency across a cluster by confirming that writes are written to a secondary before returning to the connecting client with a success. The cost of synchronous replication: increased latency and reduced throughput on writes. This throughput will be entirely based on network performance. In hosted datacenter environments (like AWS, Rackspace, or any network you do not control), synchronous replication significantly increases the variability of write performance. If followers become inaccessible from the leader, the leader effectively becomes read-only. @@ -33,10 +35,11 @@ When using PostgreSQL synchronous replication, use at least three Postgres data Using PostgreSQL synchronous replication does not guarantee zero lost transactions under all circumstances. When the primary and the secondary that is currently acting as a synchronous replica fail simultaneously a third node that might not contain all transactions will be promoted. + .. _synchronous_mode: Synchronous mode ----------------- +================ For use cases where losing committed transactions is not permissible you can turn on Patroni's ``synchronous_mode``. When ``synchronous_mode`` is turned on Patroni will not promote a standby unless it is certain that the standby contains all transactions that may have returned a successful commit status to client [2]_. This means that the system may be unavailable for writes even though some servers are available. System administrators can still use manual failover commands to promote a standby even if it results in transaction loss. @@ -55,28 +58,122 @@ up. You can ensure that a standby never becomes the synchronous standby by setting ``nosync`` tag to true. This is recommended to set for standbys that are behind slow network connections and would cause performance degradation when becoming a synchronous standby. Setting tag ``nostream`` to true will also have the same effect. -Synchronous mode can be switched on and off via Patroni REST interface. See :ref:`dynamic configuration ` for instructions. +Synchronous mode can be switched on and off using ``patronictl edit-config`` command or via Patroni REST interface. See :ref:`dynamic configuration ` for instructions. Note: Because of the way synchronous replication is implemented in PostgreSQL it is still possible to lose transactions even when using ``synchronous_mode_strict``. If the PostgreSQL backend is cancelled while waiting to acknowledge replication (as a result of packet cancellation due to client timeout or backend failure) transaction changes become visible for other backends. Such changes are not yet replicated and may be lost in case of standby promotion. + Synchronous Replication Factor ------------------------------- -The parameter ``synchronous_node_count`` is used by Patroni to manage number of synchronous standby databases. It is set to 1 by default. It has no effect when ``synchronous_mode`` is set to off. When enabled, Patroni manages precise number of synchronous standby databases based on parameter ``synchronous_node_count`` and adjusts the state in DCS & synchronous_standby_names as members join and leave. +============================== + +The parameter ``synchronous_node_count`` is used by Patroni to manage the number of synchronous standby databases. It is set to ``1`` by default. It has no effect when ``synchronous_mode`` is set to ``off``. 
When enabled, Patroni manages the precise number of synchronous standby databases based on the ``synchronous_node_count`` parameter and adjusts the state in DCS and ``synchronous_standby_names`` in PostgreSQL as members join and leave. If the parameter is set to a value higher than the number of eligible nodes it will be automatically reduced by Patroni. + + +Maximum lag on synchronous node +=============================== + +By default Patroni sticks to nodes that are declared as ``synchronous``, according to the ``pg_stat_replication`` view, even when there are other nodes ahead of them. This is done to minimize the number of changes of ``synchronous_standby_names``. To change this behavior, use the ``maximum_lag_on_syncnode`` parameter. It controls how much lag a replica can have and still be considered "synchronous". + +Patroni uses the maximum replica LSN if there is more than one standby, otherwise it uses the leader's current WAL LSN. The default is ``-1``, and Patroni will not take action to swap an unhealthy synchronous standby when the value is set to ``0`` or less. Please set the value high enough so that Patroni won't swap synchronous standbys frequently during high transaction volume. + Synchronous mode implementation -------------------------------- +=============================== -When in synchronous mode Patroni maintains synchronization state in the DCS, containing the latest primary and current synchronous standby databases. This state is updated with strict ordering constraints to ensure the following invariants: +When in synchronous mode Patroni maintains synchronization state in the DCS (``/sync`` key), containing the latest primary and current synchronous standby databases. This state is updated with strict ordering constraints to ensure the following invariants: - A node must be marked as the latest leader whenever it can accept write transactions. Patroni crashing or PostgreSQL not shutting down can cause violations of this invariant. -- A node must be set as the synchronous standby in PostgreSQL as long as it is published as the synchronous standby. +- A node must be set as the synchronous standby in PostgreSQL as long as it is published as the synchronous standby in the ``/sync`` key in DCS. - A node that is not the leader or current synchronous standby is not allowed to promote itself automatically. Patroni will only assign one or more synchronous standby nodes based on ``synchronous_node_count`` parameter to ``synchronous_standby_names``. -On each HA loop iteration Patroni re-evaluates synchronous standby nodes choice. If the current list of synchronous standby nodes are connected and has not requested its synchronous status to be removed it remains picked. Otherwise the cluster member available for sync that is furthest ahead in replication is picked. +On each HA loop iteration Patroni re-evaluates its choice of synchronous standby nodes. If the currently selected synchronous standby nodes are connected and have not requested their synchronous status to be removed, they remain selected. Otherwise the cluster members available for sync that are furthest ahead in replication are picked. + +Example: +--------- + +``/config`` key in DCS +^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: YAML + + synchronous_mode: on + synchronous_node_count: 2 + ... + +``/sync`` key in DCS +^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: JSON + + { + "leader": "node0", + "sync_standby": "node1,node2" + } + +postgresql.conf +^^^^^^^^^^^^^^^ + +..
code-block:: INI + + synchronous_standby_names = 'FIRST 2 (node1,node2)' + + +In the above examples only nodes ``node1`` and ``node2`` are known to be synchronous and allowed to be automatically promoted if the primary (``node0``) fails. + + +.. _quorum_mode: + +Quorum commit mode +================== + +Starting from PostgreSQL v10 Patroni supports quorum-based synchronous replication. + +In this mode, Patroni maintains synchronization state in the DCS, containing the latest known primary, the number of nodes required for quorum, and the nodes currently eligible to vote on quorum. In steady state, the nodes voting on quorum are the leader and all synchronous standbys. This state is updated with strict ordering constraints, with regard to node promotion and ``synchronous_standby_names``, to ensure that at all times any subset of voters that can achieve quorum includes at least one node with the latest successful commit. + +On each iteration of the HA loop, Patroni re-evaluates synchronous standby choices and quorum, based on node availability and requested cluster configuration. In PostgreSQL versions above 9.6 all eligible nodes are added as synchronous standbys as soon as their replication catches up to the leader. + +Quorum commit helps to reduce worst-case latencies, even during normal operation, as a higher latency of replicating to one standby can be compensated by other standbys. + +The quorum-based synchronous mode can be enabled by setting ``synchronous_mode`` to ``quorum`` using ``patronictl edit-config`` command or via Patroni REST interface. See :ref:`dynamic configuration ` for instructions. + +Other parameters, like ``synchronous_node_count``, ``maximum_lag_on_syncnode``, and ``synchronous_mode_strict``, continue to work the same way as with ``synchronous_mode=on``. + +Example: +--------- + +``/config`` key in DCS +^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: YAML + + synchronous_mode: quorum + synchronous_node_count: 2 + ... + +``/sync`` key in DCS +^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: JSON + + { + "leader": "node0", + "sync_standby": "node1,node2,node3", + "quorum": 1 + } + +postgresql.conf +^^^^^^^^^^^^^^^ + +.. code-block:: INI + + synchronous_standby_names = 'ANY 2 (node1,node2,node3)' + + +If the primary (``node0``) fails, in the above example two of ``node1``, ``node2``, and ``node3`` are guaranteed to have received the latest transaction, but we don't know which two. To figure out whether ``node1`` has received the latest transaction, we need to compare its LSN with the LSN on **at least** one node (``quorum=1`` in the ``/sync`` key) among ``node2`` and ``node3``. If ``node1`` isn't behind at least one of them, we can guarantee that there will be no user-visible data loss if ``node1`` is promoted. .. [1] The data is still there, but recovering it requires a manual recovery effort by data recovery specialists. When Patroni is allowed to rewind with ``use_pg_rewind`` the forked timeline will be automatically erased to rejoin the failed primary with the cluster. However, for ``use_pg_rewind`` to function properly, either the cluster must be initialized with ``data page checksums`` (``--data-checksums`` option for ``initdb``) and/or ``wal_log_hints`` must be set to ``on``.
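The promotion-safety rule described in the last paragraph can be expressed in a few lines. This is a minimal sketch, not Patroni's actual implementation; the helper name and the standalone LSN inputs are hypothetical:

.. code-block:: python

    def can_promote_safely(candidate_lsn, other_voter_lsns, quorum):
        # Every commit was acknowledged by enough voters that any group of
        # (quorum + 1) voters contains at least one node holding it. If the
        # candidate is not behind at least `quorum` other voters, it holds
        # everything those voters hold, hence every acknowledged commit.
        caught_up = sum(1 for lsn in other_voter_lsns if candidate_lsn >= lsn)
        return caught_up >= quorum

    # /sync: {"leader": "node0", "sync_standby": "node1,node2,node3", "quorum": 1}
    # node1 at LSN 1000 is not behind node2 at 900, so promoting node1 is safe.
    print(can_promote_safely(1000, [900, 1100], quorum=1))  # True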
diff --git a/docs/rest_api.rst b/docs/rest_api.rst index a03c05d23..ef16de618 100644 --- a/docs/rest_api.rst +++ b/docs/rest_api.rst @@ -45,6 +45,10 @@ For all health check ``GET`` requests Patroni returns a JSON document with the s - ``GET /read-only-sync``: like the above endpoint, but also includes the primary. +- ``GET /quorum``: returns HTTP status code **200** only when this Patroni node is listed as a quorum node in ``synchronous_standby_names`` on the primary. + +- ``GET /read-only-quorum``: like the above endpoint, but also includes the primary. + - ``GET /asynchronous`` or ``GET /async``: returns HTTP status code **200** only when the Patroni node is running as an asynchronous standby. @@ -308,6 +312,9 @@ Retrieve the Patroni metrics in Prometheus format through the ``GET /metrics`` e # HELP patroni_sync_standby Value is 1 if this node is a sync standby replica, 0 otherwise. # TYPE patroni_sync_standby gauge patroni_sync_standby{scope="batman",name="patroni1"} 0 + # HELP patroni_quorum_standby Value is 1 if this node is a quorum standby replica, 0 otherwise. + # TYPE patroni_quorum_standby gauge + patroni_quorum_standby{scope="batman",name="patroni1"} 0 # HELP patroni_xlog_received_location Current location of the received Postgres transaction log, 0 if this node is not a replica. # TYPE patroni_xlog_received_location counter patroni_xlog_received_location{scope="batman",name="patroni1"} 0 diff --git a/docs/yaml_configuration.rst b/docs/yaml_configuration.rst index d90a43fce..d11e5368e 100644 --- a/docs/yaml_configuration.rst +++ b/docs/yaml_configuration.rst @@ -29,6 +29,7 @@ Log - **static_fields**: add additional fields to the log. This option is only available when the log type is set to **json**. - **max\_queue\_size**: Patroni is using two-step logging. Log records are written into the in-memory queue and there is a separate thread which pulls them from the queue and writes to stderr or file. The maximum size of the internal queue is limited by default by **1000** records, which is enough to keep logs for the past 1h20m. - **dir**: Directory to write application logs to. The directory must exist and be writable by the user executing Patroni. If you set this value, the application will retain 4 25MB logs by default. You can tune those retention values with `file_num` and `file_size` (see below). +- **mode**: Permissions for log files (for example, ``0644``). If not specified, permissions will be set based on the current umask value. - **file\_num**: The number of application logs to retain. - **file\_size**: Size of patroni.log file (in bytes) that triggers a log rolling. 
- **loggers**: This section allows redefining logging level per python module diff --git a/features/archive-restore.py b/features/archive-restore.py index 91c216db2..8b02154cb 100644 --- a/features/archive-restore.py +++ b/features/archive-restore.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import os import argparse +import os import shutil if __name__ == "__main__": diff --git a/features/callback2.py b/features/callback2.py index 6dad933ae..1afcc14c7 100755 --- a/features/callback2.py +++ b/features/callback2.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import sys + with open("data/{0}/{0}_cb.log".format(sys.argv[1]), "a+") as log: log.write(" ".join(sys.argv[-3:]) + "\n") diff --git a/features/citus.feature b/features/citus.feature index b23eb2b2c..16afc34d0 100644 --- a/features/citus.feature +++ b/features/citus.feature @@ -11,7 +11,9 @@ Feature: citus Then replication works from postgres0 to postgres1 after 15 seconds Then replication works from postgres2 to postgres3 after 15 seconds And postgres0 is registered in the postgres0 as the primary in group 0 after 5 seconds + And postgres1 is registered in the postgres0 as the secondary in group 0 after 5 seconds And postgres2 is registered in the postgres0 as the primary in group 1 after 5 seconds + And postgres3 is registered in the postgres0 as the secondary in group 1 after 5 seconds Scenario: coordinator failover updates pg_dist_node Given I run patronictl.py failover batman --group 0 --candidate postgres1 --force @@ -19,11 +21,13 @@ Feature: citus And "members/postgres0" key in a group 0 in DCS has state=running after 15 seconds And replication works from postgres1 to postgres0 after 15 seconds And postgres1 is registered in the postgres2 as the primary in group 0 after 5 seconds + And postgres0 is registered in the postgres2 as the secondary in group 0 after 15 seconds And "sync" key in a group 0 in DCS has sync_standby=postgres0 after 15 seconds When I run patronictl.py switchover batman --group 0 --candidate postgres0 --force Then postgres0 role is the primary after 10 seconds And replication works from postgres0 to postgres1 after 15 seconds And postgres0 is registered in the postgres2 as the primary in group 0 after 5 seconds + And postgres1 is registered in the postgres2 as the secondary in group 0 after 15 seconds And "sync" key in a group 0 in DCS has sync_standby=postgres1 after 15 seconds Scenario: worker switchover doesn't break client queries on the coordinator @@ -35,6 +39,7 @@ Feature: citus And "members/postgres2" key in a group 1 in DCS has state=running after 15 seconds And replication works from postgres3 to postgres2 after 15 seconds And postgres3 is registered in the postgres0 as the primary in group 1 after 5 seconds + And postgres2 is registered in the postgres0 as the secondary in group 1 after 15 seconds And "sync" key in a group 1 in DCS has sync_standby=postgres2 after 15 seconds And a thread is still alive When I run patronictl.py switchover batman --group 1 --force @@ -42,6 +47,7 @@ Feature: citus And postgres2 role is the primary after 10 seconds And replication works from postgres2 to postgres3 after 15 seconds And postgres2 is registered in the postgres0 as the primary in group 1 after 5 seconds + And postgres3 is registered in the postgres0 as the secondary in group 1 after 15 seconds And "sync" key in a group 1 in DCS has sync_standby=postgres3 after 15 seconds And a thread is still alive When I stop a thread @@ -55,6 +61,7 @@ Feature: citus And postgres2 role is the primary after 10 seconds And 
replication works from postgres2 to postgres3 after 15 seconds And postgres2 is registered in the postgres0 as the primary in group 1 after 5 seconds + And postgres3 is registered in the postgres0 as the secondary in group 1 after 15 seconds And a thread is still alive When I stop a thread Then a distributed table on postgres0 has expected rows diff --git a/features/environment.py b/features/environment.py index 9f9d5fa76..fb4220d00 100644 --- a/features/environment.py +++ b/features/environment.py @@ -1,9 +1,8 @@ import abc import datetime import glob -import os import json -import psutil +import os import re import shutil import signal @@ -13,11 +12,14 @@ import tempfile import threading import time + +from http.server import BaseHTTPRequestHandler, HTTPServer + +import psutil import yaml import patroni.psycopg as psycopg -from http.server import BaseHTTPRequestHandler, HTTPServer from patroni.request import PatroniRequest @@ -498,6 +500,7 @@ def _start(self): def _is_running(self): from patroni.dcs.etcd import DnsCachingResolver + # if etcd is running, but we didn't start it try: self._client = self._client_cls({'host': 'localhost', 'port': 2379, 'retry_timeout': 30, diff --git a/features/quorum_commit.feature b/features/quorum_commit.feature new file mode 100644 index 000000000..2204ab596 --- /dev/null +++ b/features/quorum_commit.feature @@ -0,0 +1,68 @@ +Feature: quorum commit + Check basic workflows when quorum commit is enabled + + Scenario: check enable quorum commit and that the only leader promotes after restart + Given I start postgres0 + Then postgres0 is a leader after 10 seconds + And there is a non empty initialize key in DCS after 15 seconds + When I issue a PATCH request to http://127.0.0.1:8008/config with {"ttl": 20, "synchronous_mode": "quorum"} + Then I receive a response code 200 + And sync key in DCS has leader=postgres0 after 20 seconds + And sync key in DCS has quorum=0 after 2 seconds + And synchronous_standby_names on postgres0 is set to "_empty_str_" after 2 seconds + When I shut down postgres0 + And sync key in DCS has leader=postgres0 after 2 seconds + When I start postgres0 + Then postgres0 role is the primary after 10 seconds + When I issue a PATCH request to http://127.0.0.1:8008/config with {"synchronous_mode_strict": true} + Then synchronous_standby_names on postgres0 is set to "ANY 1 (*)" after 10 seconds + + Scenario: check failover with one quorum standby + Given I start postgres1 + Then sync key in DCS has sync_standby=postgres1 after 10 seconds + And synchronous_standby_names on postgres0 is set to "ANY 1 (postgres1)" after 2 seconds + When I shut down postgres0 + Then postgres1 role is the primary after 10 seconds + And sync key in DCS has quorum=0 after 10 seconds + Then synchronous_standby_names on postgres1 is set to "ANY 1 (*)" after 10 seconds + When I start postgres0 + Then sync key in DCS has leader=postgres1 after 10 seconds + Then sync key in DCS has sync_standby=postgres0 after 10 seconds + And synchronous_standby_names on postgres1 is set to "ANY 1 (postgres0)" after 2 seconds + + Scenario: check behavior with three nodes and different replication factor + Given I start postgres2 + Then sync key in DCS has sync_standby=postgres0,postgres2 after 10 seconds + And sync key in DCS has quorum=1 after 2 seconds + And synchronous_standby_names on postgres1 is set to "ANY 1 (postgres0,postgres2)" after 2 seconds + When I issue a PATCH request to http://127.0.0.1:8009/config with {"synchronous_node_count": 2} + Then sync key in DCS has quorum=0
after 10 seconds + And synchronous_standby_names on postgres1 is set to "ANY 2 (postgres0,postgres2)" after 2 seconds + + Scenario: switch from quorum replication to good old multisync and back + Given I issue a PATCH request to http://127.0.0.1:8009/config with {"synchronous_mode": true, "synchronous_node_count": 1} + And I shut down postgres0 + Then synchronous_standby_names on postgres1 is set to "postgres2" after 10 seconds + And sync key in DCS has sync_standby=postgres2 after 10 seconds + Then sync key in DCS has quorum=0 after 2 seconds + When I issue a PATCH request to http://127.0.0.1:8009/config with {"synchronous_mode": "quorum"} + And I start postgres0 + Then synchronous_standby_names on postgres1 is set to "ANY 1 (postgres0,postgres2)" after 10 seconds + And sync key in DCS has sync_standby=postgres0,postgres2 after 10 seconds + Then sync key in DCS has quorum=1 after 2 seconds + + Scenario: REST API and patronictl + Given I run patronictl.py list batman + Then I receive a response returncode 0 + And I receive a response output "Quorum Standby" + And Status code on GET http://127.0.0.1:8008/quorum is 200 after 3 seconds + And Status code on GET http://127.0.0.1:8010/quorum is 200 after 3 seconds + + Scenario: nosync node is removed from voters and synchronous_standby_names + Given I add tag nosync true to postgres2 config + When I issue an empty POST request to http://127.0.0.1:8010/reload + Then I receive a response code 202 + And sync key in DCS has quorum=0 after 10 seconds + And sync key in DCS has sync_standby=postgres0 after 10 seconds + And synchronous_standby_names on postgres1 is set to "ANY 1 (postgres0)" after 2 seconds + And Status code on GET http://127.0.0.1:8010/quorum is 503 after 10 seconds diff --git a/features/steps/basic_replication.py b/features/steps/basic_replication.py index 6cb0ac42b..3ec61ca6b 100644 --- a/features/steps/basic_replication.py +++ b/features/steps/basic_replication.py @@ -1,9 +1,11 @@ import json -import patroni.psycopg as pg -from behave import step, then from time import sleep, time +from behave import step, then + +import patroni.psycopg as pg + @step('I start {name:w}') def start_patroni(context, name): diff --git a/features/steps/citus.py b/features/steps/citus.py index 7277cccce..274a8b514 100644 --- a/features/steps/citus.py +++ b/features/steps/citus.py @@ -1,11 +1,12 @@ import json import time -from behave import step, then -from dateutil import tz from datetime import datetime from functools import partial -from threading import Thread, Event +from threading import Event, Thread + +from behave import step, then +from dateutil import tz tzutc = tz.tzutc() diff --git a/features/steps/patroni_api.py b/features/steps/patroni_api.py index 74a7c0da8..ded5ce2a6 100644 --- a/features/steps/patroni_api.py +++ b/features/steps/patroni_api.py @@ -1,14 +1,16 @@ import json -import parse import shlex import subprocess import sys import time + +from datetime import datetime, timedelta + +import parse import yaml from behave import register_type, step, then from dateutil import tz -from datetime import datetime, timedelta tzutc = tz.tzutc() diff --git a/features/steps/quorum_commit.py b/features/steps/quorum_commit.py new file mode 100644 index 000000000..48fc4b623 --- /dev/null +++ b/features/steps/quorum_commit.py @@ -0,0 +1,60 @@ +import json +import re +import time + +from behave import step, then + + +@step('sync key in DCS has {key:w}={value} after {time_limit:d} seconds') +def check_sync(context, key, value, time_limit): + time_limit 
*= context.timeout_multiplier + max_time = time.time() + int(time_limit) + dcs_value = None + while time.time() < max_time: + try: + response = json.loads(context.dcs_ctl.query('sync')) + dcs_value = response.get(key) + if key == 'sync_standby' and set((dcs_value or '').split(',')) == set(value.split(',')): + return + elif str(dcs_value) == value: + return + except Exception: + pass + time.sleep(1) + assert False, "sync does not have {0}={1} (found {2}) in dcs after {3} seconds".format(key, value, + dcs_value, time_limit) + + +def _parse_synchronous_standby_names(value): + if '(' in value: + m = re.match(r'.*(\d+) \(([^)]+)\)', value) + expected_value = set(m.group(2).split()) + expected_num = m.group(1) + else: + expected_value = set([value]) + expected_num = '1' + return expected_num, expected_value + + +@then('synchronous_standby_names on {name:w} is set to "{value}" after {time_limit:d} seconds') +def check_synchronous_standby_names(context, name, value, time_limit): + time_limit *= context.timeout_multiplier + max_time = time.time() + int(time_limit) + + if value == '_empty_str_': + value = '' + + expected_num, expected_value = _parse_synchronous_standby_names(value) + + ssn = None + while time.time() < max_time: + try: + ssn = context.pctl.query(name, "SHOW synchronous_standby_names").fetchone()[0] + db_num, db_value = _parse_synchronous_standby_names(ssn) + if expected_value == db_value and expected_num == db_num: + return + except Exception: + pass + time.sleep(1) + assert False, "synchronous_standby_names is not set to '{0}' (found '{1}') after {2} seconds".format(value, ssn, + time_limit) diff --git a/features/steps/slots.py b/features/steps/slots.py index 182aa87c2..84d7e350e 100644 --- a/features/steps/slots.py +++ b/features/steps/slots.py @@ -2,6 +2,7 @@ import time from behave import step, then + import patroni.psycopg as pg diff --git a/features/steps/watchdog.py b/features/steps/watchdog.py index 05eba73c5..ebec646ee 100644 --- a/features/steps/watchdog.py +++ b/features/steps/watchdog.py @@ -1,6 +1,7 @@ -from behave import step, then import time + +from behave import step, then + def polling_loop(timeout, interval=1): """Returns an iterator that returns values until timeout has passed.
Timeout is measured from start of iteration.""" diff --git a/patroni/__main__.py b/patroni/__main__.py index c4c9a497e..04d53c9ec 100644 --- a/patroni/__main__.py +++ b/patroni/__main__.py @@ -13,7 +13,7 @@ from typing import Any, Dict, List, Optional, TYPE_CHECKING from patroni import MIN_PSYCOPG2, MIN_PSYCOPG3, parse_version -from patroni.daemon import AbstractPatroniDaemon, abstract_main, get_base_arg_parser +from patroni.daemon import abstract_main, AbstractPatroniDaemon, get_base_arg_parser from patroni.tags import Tags if TYPE_CHECKING: # pragma: no cover @@ -268,8 +268,8 @@ def process_arguments() -> Namespace: generate_config(args.configfile, False, args.dsn) sys.exit(0) elif args.validate_config: - from patroni.validator import schema from patroni.config import Config, ConfigParseError + from patroni.validator import schema try: Config(args.configfile, validator=schema) diff --git a/patroni/api.py b/patroni/api.py index 21a1514ac..d74253267 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -7,32 +7,32 @@ """ import base64 +import datetime import hmac import json import logging -import time -import traceback -import dateutil.parser -import datetime import os import socket import sys +import time +import traceback from http.server import BaseHTTPRequestHandler, HTTPServer from ipaddress import ip_address, ip_network, IPv4Network, IPv6Network from socketserver import ThreadingMixIn from threading import Thread -from urllib.parse import urlparse, parse_qs - from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING, Union +from urllib.parse import parse_qs, urlparse + +import dateutil.parser from . import global_config, psycopg from .__main__ import Patroni from .dcs import Cluster from .exceptions import PostgresConnectionException, PostgresException from .postgresql.misc import postgres_version_to_int -from .utils import deep_compare, enable_keepalive, parse_bool, patch_config, Retry, \ - RetryFailedError, parse_int, split_host_port, tzutc, uri, cluster_as_json +from .utils import cluster_as_json, deep_compare, enable_keepalive, parse_bool, \ + parse_int, patch_config, Retry, RetryFailedError, split_host_port, tzutc, uri logger = logging.getLogger(__name__) @@ -266,6 +266,14 @@ def do_GET(self, write_status_code_only: bool = False) -> None: * HTTP status ``200``: if up and running and without ``noloadbalance`` tag. + * ``/quorum``: + + * HTTP status ``200``: if up and running as a quorum synchronous standby. + + * ``/read-only-quorum``: + + * HTTP status ``200``: if up and running as a quorum synchronous standby or primary. + * ``/synchronous`` or ``/sync``: * HTTP status ``200``: if up and running as a synchronous standby. 
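As an illustration of how the new health-check endpoints are meant to be consumed, a load balancer (or a script) only needs the HTTP status code. A minimal sketch, assuming a node whose REST API listens on 127.0.0.1:8008 (hypothetical address, not part of the patch):

.. code-block:: python

    from urllib.error import HTTPError, URLError
    from urllib.request import urlopen

    def is_quorum_standby(base_url):
        # Patroni returns 200 on GET /quorum only when this node is listed
        # as a quorum node in synchronous_standby_names on the primary,
        # and 503 otherwise.
        try:
            return urlopen(base_url + '/quorum', timeout=5).status == 200
        except (HTTPError, URLError):
            return False

    print(is_quorum_standby('http://127.0.0.1:8008'))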
@@ -346,16 +354,24 @@ def do_GET(self, write_status_code_only: bool = False) -> None: ignore_tags = True elif 'replica' in path: status_code = replica_status_code - elif 'read-only' in path and 'sync' not in path: + elif 'read-only' in path and 'sync' not in path and 'quorum' not in path: status_code = 200 if 200 in (primary_status_code, standby_leader_status_code) else replica_status_code elif 'health' in path: status_code = 200 if response.get('state') == 'running' else 503 elif cluster: # dcs is available + is_quorum = response.get('quorum_standby') is_synchronous = response.get('sync_standby') if path in ('/sync', '/synchronous') and is_synchronous: status_code = replica_status_code - elif path in ('/async', '/asynchronous') and not is_synchronous: + elif path == '/quorum' and is_quorum: + status_code = replica_status_code + elif path in ('/async', '/asynchronous') and not is_synchronous and not is_quorum: status_code = replica_status_code + elif path == '/read-only-quorum': + if 200 in (primary_status_code, standby_leader_status_code): + status_code = 200 + elif is_quorum: + status_code = replica_status_code elif path in ('/read-only-sync', '/read-only-synchronous'): if 200 in (primary_status_code, standby_leader_status_code): status_code = 200 @@ -522,6 +538,7 @@ def do_GET_metrics(self) -> None: * ``patroni_standby_leader``: ``1`` if standby leader node, else ``0``; * ``patroni_replica``: ``1`` if a replica, else ``0``; * ``patroni_sync_standby``: ``1`` if a sync replica, else ``0``; + * ``patroni_quorum_standby``: ``1`` if a quorum sync replica, else ``0``; * ``patroni_xlog_received_location``: ``pg_wal_lsn_diff(pg_last_wal_receive_lsn(), '0/0')``; * ``patroni_xlog_replayed_location``: ``pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '0/0)``; * ``patroni_xlog_replayed_timestamp``: ``pg_last_xact_replay_timestamp``; @@ -584,10 +601,14 @@ def do_GET_metrics(self) -> None: metrics.append("# TYPE patroni_replica gauge") metrics.append("patroni_replica{0} {1}".format(labels, int(postgres['role'] == 'replica'))) - metrics.append("# HELP patroni_sync_standby Value is 1 if this node is a sync standby replica, 0 otherwise.") + metrics.append("# HELP patroni_sync_standby Value is 1 if this node is a sync standby, 0 otherwise.") metrics.append("# TYPE patroni_sync_standby gauge") metrics.append("patroni_sync_standby{0} {1}".format(labels, int(postgres.get('sync_standby', False)))) + metrics.append("# HELP patroni_quorum_standby Value is 1 if this node is a quorum standby, 0 otherwise.") + metrics.append("# TYPE patroni_quorum_standby gauge") + metrics.append("patroni_quorum_standby{0} {1}".format(labels, int(postgres.get('quorum_standby', False)))) + metrics.append("# HELP patroni_xlog_received_location Current location of the received" " Postgres transaction log, 0 if this node is not a replica.") metrics.append("# TYPE patroni_xlog_received_location counter") @@ -1049,16 +1070,17 @@ def is_failover_possible(self, cluster: Cluster, leader: Optional[str], candidat :returns: a string with the error message or ``None`` if good nodes are found. 
""" - is_synchronous_mode = global_config.from_cluster(cluster).is_synchronous_mode + config = global_config.from_cluster(cluster) if leader and (not cluster.leader or cluster.leader.name != leader): return 'leader name does not match' if candidate: - if action == 'switchover' and is_synchronous_mode and not cluster.sync.matches(candidate): + if action == 'switchover' and config.is_synchronous_mode\ + and not config.is_quorum_commit_mode and not cluster.sync.matches(candidate): return 'candidate name does not match with sync_standby' members = [m for m in cluster.members if m.name == candidate] if not members: return 'candidate does not exists' - elif is_synchronous_mode: + elif config.is_synchronous_mode and not config.is_quorum_commit_mode: members = [m for m in cluster.members if cluster.sync.matches(m.name)] if not members: return action + ' is not possible: can not find sync_standby' @@ -1265,6 +1287,7 @@ def get_postgresql_status(self, retry: bool = False) -> Dict[str, Any]: * ``paused``: ``pg_is_wal_replay_paused()``; * ``sync_standby``: ``True`` if replication mode is synchronous and this is a sync standby; + * ``quorum_standby``: ``True`` if replication mode is quorum and this is a quorum standby; * ``timeline``: PostgreSQL primary node timeline; * ``replication``: :class:`list` of :class:`dict` entries, one for each replication connection. Each entry contains the following keys: @@ -1320,7 +1343,7 @@ def get_postgresql_status(self, retry: bool = False) -> Dict[str, Any]: if result['role'] == 'replica' and config.is_synchronous_mode\ and cluster and cluster.sync.matches(postgresql.name): - result['sync_standby'] = True + result['quorum_standby' if global_config.is_quorum_commit_mode else 'sync_standby'] = True if row[1] > 0: result['timeline'] = row[1] diff --git a/patroni/config.py b/patroni/config.py index 6b46f4bb1..88f3a1a6b 100644 --- a/patroni/config.py +++ b/patroni/config.py @@ -1,15 +1,16 @@ """Facilities related to Patroni configuration.""" -import re import json import logging import os +import re import shutil import tempfile -import yaml from collections import defaultdict from copy import deepcopy -from typing import Any, Callable, Collection, Dict, List, Optional, Union, TYPE_CHECKING +from typing import Any, Callable, Collection, Dict, List, Optional, TYPE_CHECKING, Union + +import yaml from . 
import PATRONI_ENV_PREFIX from .collections import CaseInsensitiveDict, EMPTY_DICT @@ -17,8 +18,8 @@ from .exceptions import ConfigParseError from .file_perm import pg_perm from .postgresql.config import ConfigHandler -from .validator import IntValidator from .utils import deep_compare, parse_bool, parse_int, patch_config +from .validator import IntValidator logger = logging.getLogger(__name__) @@ -535,7 +536,7 @@ def _set_section_values(section: str, params: List[str]) -> None: _set_section_values('postgresql', ['listen', 'connect_address', 'proxy_address', 'config_dir', 'data_dir', 'pgpass', 'bin_dir']) _set_section_values('log', ['type', 'level', 'traceback_level', 'format', 'dateformat', 'static_fields', - 'max_queue_size', 'dir', 'file_size', 'file_num', 'loggers']) + 'max_queue_size', 'dir', 'mode', 'file_size', 'file_num', 'loggers']) _set_section_values('raft', ['data_dir', 'self_addr', 'partner_addrs', 'password', 'bind_addr']) for binary in ('pg_ctl', 'initdb', 'pg_controldata', 'pg_basebackup', 'postgres', 'pg_isready', 'pg_rewind'): @@ -552,7 +553,7 @@ def _set_section_values(section: str, params: List[str]) -> None: ret[first][second] = value for first, params in (('restapi', ('request_queue_size',)), - ('log', ('max_queue_size', 'file_size', 'file_num'))): + ('log', ('max_queue_size', 'file_size', 'file_num', 'mode'))): for second in params: value = ret.get(first, {}).pop(second, None) if value: @@ -676,23 +677,6 @@ def _get_auth(name: str, params: Collection[str] = _AUTH_ALLOWED_PARAMETERS[:2]) if dcs in ret: ret[dcs].update(_get_auth(dcs)) - users = {} - for param in list(os.environ.keys()): - if param.startswith(PATRONI_ENV_PREFIX): - name, suffix = (param[len(PATRONI_ENV_PREFIX):].rsplit('_', 1) + [''])[:2] - # PATRONI__PASSWORD=, PATRONI__OPTIONS= - # CREATE USER "" WITH PASSWORD '' - if name and suffix == 'PASSWORD': - password = os.environ.pop(param) - if password: - users[name] = {'password': password} - options = os.environ.pop(param[:-9] + '_OPTIONS', None) # replace "_PASSWORD" with "_OPTIONS" - options = options and _parse_list(options) - if options: - users[name]['options'] = options - if users: - ret['bootstrap']['users'] = users - return ret def _build_effective_configuration(self, dynamic_configuration: Dict[str, Any], @@ -756,7 +740,7 @@ def _build_effective_configuration(self, dynamic_configuration: Dict[str, Any], if 'citus' in config: bootstrap = config.setdefault('bootstrap', {}) dcs = bootstrap.setdefault('dcs', {}) - dcs.setdefault('synchronous_mode', True) + dcs.setdefault('synchronous_mode', 'quorum') updated_fields = ( 'name', diff --git a/patroni/config_generator.py b/patroni/config_generator.py index 8a1d6078a..4977962d4 100644 --- a/patroni/config_generator.py +++ b/patroni/config_generator.py @@ -2,14 +2,16 @@ import abc import logging import os -import psutil import socket import sys -import yaml -from getpass import getuser, getpass from contextlib import contextmanager +from getpass import getpass, getuser from typing import Any, Dict, Iterator, List, Optional, TextIO, Tuple, TYPE_CHECKING, Union + +import psutil +import yaml + if TYPE_CHECKING: # pragma: no cover from psycopg import Cursor from psycopg2 import cursor @@ -23,7 +25,6 @@ from .postgresql.misc import postgres_major_version_to_int from .utils import get_major_version, parse_bool, patch_config, read_stripped - # Mapping between the libpq connection parameters and the environment variables. 
# This dict should be kept in sync with `patroni.utils._AUTH_ALLOWED_PARAMETERS` # (we use "username" in the Patroni config for some reason, other parameter names are the same). diff --git a/patroni/ctl.py b/patroni/ctl.py index 4e05ab8d5..dd813a77e 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -12,12 +12,9 @@ If it is also missing in the configuration file we assume that this is just a normal Patroni cluster (not Citus). """ -import click import codecs import copy import datetime -import dateutil.parser -import dateutil.tz import difflib import io import json @@ -28,15 +25,21 @@ import subprocess import sys import tempfile -import urllib3 import time -import yaml from collections import defaultdict from contextlib import contextmanager -from prettytable import ALL, FRAME, PrettyTable +from typing import Any, Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING, Union from urllib.parse import urlparse -from typing import Any, Dict, Iterator, List, Optional, Union, Tuple, TYPE_CHECKING + +import click +import dateutil.parser +import dateutil.tz +import urllib3 +import yaml + +from prettytable import ALL, FRAME, PrettyTable + if TYPE_CHECKING: # pragma: no cover from psycopg import Cursor from psycopg2 import cursor @@ -52,12 +55,12 @@ from . import global_config from .config import Config -from .dcs import get_dcs as _get_dcs, AbstractDCS, Cluster, Member +from .dcs import AbstractDCS, Cluster, get_dcs as _get_dcs, Member from .exceptions import PatroniException from .postgresql.misc import postgres_version_to_int from .postgresql.mpp import get_mpp -from .utils import cluster_as_json, patch_config, polling_loop from .request import PatroniRequest +from .utils import cluster_as_json, patch_config, polling_loop from .version import __version__ CONFIG_DIR_PATH = click.get_app_dir('patroni') diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py index 7857bd83b..c41b79041 100644 --- a/patroni/dcs/__init__.py +++ b/patroni/dcs/__init__.py @@ -5,22 +5,22 @@ import logging import re import time + from collections import defaultdict from copy import deepcopy from random import randint from threading import Event, Lock from typing import Any, Callable, Collection, Dict, Iterator, List, \ NamedTuple, Optional, Set, Tuple, Type, TYPE_CHECKING, Union -from urllib.parse import urlparse, urlunparse, parse_qsl +from urllib.parse import parse_qsl, urlparse, urlunparse import dateutil.parser from .. import global_config from ..dynamic_loader import iter_classes, iter_modules from ..exceptions import PatroniFatalException -from ..utils import deep_compare, uri from ..tags import Tags -from ..utils import parse_int +from ..utils import deep_compare, parse_int, uri if TYPE_CHECKING: # pragma: no cover from ..config import Config @@ -548,11 +548,15 @@ class SyncState(NamedTuple): :ivar version: modification version of a synchronization key in a Configuration Store. :ivar leader: reference to member that was leader. :ivar sync_standby: synchronous standby list (comma delimited) which are last synchronized to leader. + :ivar quorum: if the node from :attr:`~SyncState.sync_standby` list is doing a leader race it should + see at least :attr:`~SyncState.quorum` other nodes from the + :attr:`~SyncState.sync_standby` + :attr:`~SyncState.leader` list. 
""" version: Optional[_Version] leader: Optional[str] sync_standby: Optional[str] + quorum: int @staticmethod def from_node(version: Optional[_Version], value: Union[str, Dict[str, Any], None]) -> 'SyncState': @@ -587,7 +591,9 @@ def from_node(version: Optional[_Version], value: Union[str, Dict[str, Any], Non if value and isinstance(value, str): value = json.loads(value) assert isinstance(value, dict) - return SyncState(version, value.get('leader'), value.get('sync_standby')) + leader = value.get('leader') + quorum = value.get('quorum') + return SyncState(version, leader, value.get('sync_standby'), int(quorum) if leader and quorum else 0) except (AssertionError, TypeError, ValueError): return SyncState.empty(version) @@ -599,7 +605,7 @@ def empty(version: Optional[_Version] = None) -> 'SyncState': :returns: empty synchronisation state object. """ - return SyncState(version, None, None) + return SyncState(version, None, None, 0) @property def is_empty(self) -> bool: @@ -617,10 +623,17 @@ def _str_to_list(value: str) -> List[str]: return list(filter(lambda a: a, [s.strip() for s in value.split(',')])) @property - def members(self) -> List[str]: + def voters(self) -> List[str]: """:attr:`~SyncState.sync_standby` as list or an empty list if undefined or object considered ``empty``.""" return self._str_to_list(self.sync_standby) if not self.is_empty and self.sync_standby else [] + @property + def members(self) -> List[str]: + """:attr:`~SyncState.sync_standby` and :attr:`~SyncState.leader` as list + or an empty list if object considered ``empty``. + """ + return [] if not self.leader else [self.leader] + self.voters + def matches(self, name: Optional[str], check_leader: bool = False) -> bool: """Checks if node is presented in the /sync state. @@ -634,7 +647,7 @@ def matches(self, name: Optional[str], check_leader: bool = False) -> bool: the sync state. :Example: - >>> s = SyncState(1, 'foo', 'bar,zoo') + >>> s = SyncState(1, 'foo', 'bar,zoo', 0) >>> s.matches('foo') False @@ -1885,18 +1898,23 @@ def delete_cluster(self) -> bool: """ @staticmethod - def sync_state(leader: Optional[str], sync_standby: Optional[Collection[str]]) -> Dict[str, Any]: + def sync_state(leader: Optional[str], sync_standby: Optional[Collection[str]], + quorum: Optional[int]) -> Dict[str, Any]: """Build ``sync_state`` dictionary. :param leader: name of the leader node that manages ``/sync`` key. :param sync_standby: collection of currently known synchronous standby node names. + :param quorum: if the node from :attr:`~SyncState.sync_standby` list is doing a leader race it should + see at least :attr:`~SyncState.quorum` other nodes from the + :attr:`~SyncState.sync_standby` + :attr:`~SyncState.leader` list :returns: dictionary that later could be serialized to JSON or saved directly to DCS. """ - return {'leader': leader, 'sync_standby': ','.join(sorted(sync_standby)) if sync_standby else None} + return {'leader': leader, 'quorum': quorum, + 'sync_standby': ','.join(sorted(sync_standby)) if sync_standby else None} def write_sync_state(self, leader: Optional[str], sync_standby: Optional[Collection[str]], - version: Optional[Any] = None) -> Optional[SyncState]: + quorum: Optional[int], version: Optional[Any] = None) -> Optional[SyncState]: """Write the new synchronous state to DCS. 
Calls :meth:`~AbstractDCS.sync_state` to build a dictionary and then calls DCS specific @@ -1905,10 +1923,13 @@ def write_sync_state(self, leader: Optional[str], sync_standby: Optional[Collect :param leader: name of the leader node that manages ``/sync`` key. :param sync_standby: collection of currently known synchronous standby node names. :param version: for conditional update of the key/object. + :param quorum: if the node from :attr:`~SyncState.sync_standby` list is doing a leader race it should + see at least :attr:`~SyncState.quorum` other nodes from the + :attr:`~SyncState.sync_standby` + :attr:`~SyncState.leader` list :returns: the new :class:`SyncState` object or ``None``. """ - sync_value = self.sync_state(leader, sync_standby) + sync_value = self.sync_state(leader, sync_standby, quorum) ret = self.set_sync_state_value(json.dumps(sync_value, separators=(',', ':')), version) if not isinstance(ret, bool): return SyncState.from_node(ret, sync_value) diff --git a/patroni/dcs/consul.py b/patroni/dcs/consul.py index deedfdb14..6a8829db6 100644 --- a/patroni/dcs/consul.py +++ b/patroni/dcs/consul.py @@ -1,4 +1,5 @@ from __future__ import absolute_import + import json import logging import os @@ -6,20 +7,23 @@ import socket import ssl import time -import urllib3 from collections import defaultdict -from consul import ConsulException, NotFound, base from http.client import HTTPException +from typing import Any, Callable, Dict, List, Mapping, NamedTuple, Optional, Tuple, TYPE_CHECKING, Union +from urllib.parse import quote, urlencode, urlparse + +import urllib3 + +from consul import base, ConsulException, NotFound from urllib3.exceptions import HTTPError -from urllib.parse import urlencode, urlparse, quote -from typing import Any, Callable, Dict, List, Mapping, NamedTuple, Optional, Union, Tuple, TYPE_CHECKING -from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, \ - TimelineHistory, ReturnFalseException, catch_return_false_exception from ..exceptions import DCSError from ..postgresql.mpp import AbstractMPP from ..utils import deep_compare, parse_bool, Retry, RetryFailedError, split_host_port, uri, USER_AGENT +from . import AbstractDCS, catch_return_false_exception, Cluster, ClusterConfig, \ + Failover, Leader, Member, ReturnFalseException, Status, SyncState, TimelineHistory + if TYPE_CHECKING: # pragma: no cover from ..config import Config diff --git a/patroni/dcs/etcd.py b/patroni/dcs/etcd.py index 0b6c6dd3f..5b9365a8c 100644 --- a/patroni/dcs/etcd.py +++ b/patroni/dcs/etcd.py @@ -1,32 +1,36 @@ from __future__ import absolute_import + import abc -import etcd import json import logging import os -import urllib3.util.connection import random import socket import time from collections import defaultdict from copy import deepcopy -from dns.exception import DNSException -from dns import resolver from http.client import HTTPException from queue import Queue from threading import Thread -from typing import Any, Callable, Collection, Dict, List, Optional, Union, Tuple, Type, TYPE_CHECKING +from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, Union from urllib.parse import urlparse + +import etcd +import urllib3.util.connection + +from dns import resolver +from dns.exception import DNSException from urllib3 import Timeout -from urllib3.exceptions import HTTPError, ReadTimeoutError, ProtocolError +from urllib3.exceptions import HTTPError, ProtocolError, ReadTimeoutError -from . 
import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, \ - TimelineHistory, ReturnFalseException, catch_return_false_exception from ..exceptions import DCSError from ..postgresql.mpp import AbstractMPP from ..request import get as requests_get from ..utils import Retry, RetryFailedError, split_host_port, uri, USER_AGENT +from . import AbstractDCS, catch_return_false_exception, Cluster, ClusterConfig, \ + Failover, Leader, Member, ReturnFalseException, Status, SyncState, TimelineHistory + if TYPE_CHECKING: # pragma: no cover from ..config import Config diff --git a/patroni/dcs/etcd3.py b/patroni/dcs/etcd3.py index 4a840c7f3..1539b9eab 100644 --- a/patroni/dcs/etcd3.py +++ b/patroni/dcs/etcd3.py @@ -1,26 +1,29 @@ from __future__ import absolute_import + import base64 -import etcd import json import logging import os import socket import sys import time -import urllib3 from collections import defaultdict from enum import IntEnum -from urllib3.exceptions import ReadTimeoutError, ProtocolError from threading import Condition, Lock, Thread from typing import Any, Callable, Collection, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING, Union -from . import ClusterConfig, Cluster, Failover, Leader, Member, Status, SyncState, \ - TimelineHistory, catch_return_false_exception -from .etcd import AbstractEtcdClientWithFailover, AbstractEtcd, catch_etcd_errors, DnsCachingResolver, Retry +import etcd +import urllib3 + +from urllib3.exceptions import ProtocolError, ReadTimeoutError + from ..exceptions import DCSError, PatroniException from ..postgresql.mpp import AbstractMPP from ..utils import deep_compare, enable_keepalive, iter_response_objects, RetryFailedError, USER_AGENT +from . import catch_return_false_exception, Cluster, ClusterConfig, \ + Failover, Leader, Member, Status, SyncState, TimelineHistory +from .etcd import AbstractEtcd, AbstractEtcdClientWithFailover, catch_etcd_errors, DnsCachingResolver, Retry logger = logging.getLogger(__name__) diff --git a/patroni/dcs/exhibitor.py b/patroni/dcs/exhibitor.py index 03d235758..b5b453190 100644 --- a/patroni/dcs/exhibitor.py +++ b/patroni/dcs/exhibitor.py @@ -5,11 +5,11 @@ from typing import Any, Callable, Dict, List, Union -from . import Cluster -from .zookeeper import ZooKeeper from ..postgresql.mpp import AbstractMPP from ..request import get as requests_get from ..utils import uri +from . import Cluster +from .zookeeper import ZooKeeper logger = logging.getLogger(__name__) diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index 8ed7041f1..d41d9006f 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -9,22 +9,25 @@ import socket import tempfile import time -import urllib3 -import yaml from collections import defaultdict from copy import deepcopy from http.client import HTTPException -from urllib3.exceptions import HTTPError from threading import Condition, Lock, Thread -from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Type, Union, TYPE_CHECKING +from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, Union + +import urllib3 +import yaml + +from urllib3.exceptions import HTTPError -from . 
import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, TimelineHistory from ..collections import EMPTY_DICT from ..exceptions import DCSError from ..postgresql.mpp import AbstractMPP -from ..utils import deep_compare, iter_response_objects, keepalive_socket_options, \ - Retry, RetryFailedError, tzutc, uri, USER_AGENT +from ..utils import deep_compare, iter_response_objects, \ + keepalive_socket_options, Retry, RetryFailedError, tzutc, uri, USER_AGENT +from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, TimelineHistory + if TYPE_CHECKING: # pragma: no cover from ..config import Config @@ -1373,15 +1376,18 @@ def set_sync_state_value(self, value: str, version: Optional[str] = None) -> boo raise NotImplementedError # pragma: no cover def write_sync_state(self, leader: Optional[str], sync_standby: Optional[Collection[str]], - version: Optional[str] = None) -> Optional[SyncState]: + quorum: Optional[int], version: Optional[str] = None) -> Optional[SyncState]: """Prepare and write annotations to $SCOPE-sync Endpoint or ConfigMap. :param leader: name of the leader node that manages /sync key :param sync_standby: collection of currently known synchronous standby node names + :param quorum: if the node from sync_standby list is doing a leader race it should + see at least quorum other nodes from the sync_standby + leader list :param version: last known `resource_version` for conditional update of the object :returns: the new :class:`SyncState` object or None """ - sync_state = self.sync_state(leader, sync_standby) + sync_state = self.sync_state(leader, sync_standby, quorum) + sync_state['quorum'] = str(sync_state['quorum']) if sync_state['quorum'] is not None else None ret = self.patch_or_create(self.sync_path, sync_state, version, False) if not isinstance(ret, bool): return SyncState.from_node(ret.metadata.resource_version, sync_state) @@ -1393,7 +1399,7 @@ def delete_sync_state(self, version: Optional[str] = None) -> bool: :param version: last known `resource_version` for conditional update of the object :returns: `True` if "delete" was successful """ - return self.write_sync_state(None, None, version=version) is not None + return self.write_sync_state(None, None, None, version=version) is not None def watch(self, leader_version: Optional[str], timeout: float) -> bool: if self.__do_not_watch: diff --git a/patroni/dcs/raft.py b/patroni/dcs/raft.py index 9965611ee..9d0ab4a0d 100644 --- a/patroni/dcs/raft.py +++ b/patroni/dcs/raft.py @@ -5,17 +5,19 @@ import time from collections import defaultdict -from pysyncobj import SyncObj, SyncObjConf, replicated, FAIL_REASON +from typing import Any, Callable, Collection, Dict, List, Optional, Set, TYPE_CHECKING, Union + +from pysyncobj import FAIL_REASON, replicated, SyncObj, SyncObjConf from pysyncobj.dns_resolver import globalDnsResolver from pysyncobj.node import TCPNode -from pysyncobj.transport import TCPTransport, CONNECTION_STATE +from pysyncobj.transport import CONNECTION_STATE, TCPTransport from pysyncobj.utility import TcpUtility -from typing import Any, Callable, Collection, Dict, List, Optional, Set, Union, TYPE_CHECKING -from . import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, Status, SyncState, TimelineHistory from ..exceptions import DCSError from ..postgresql.mpp import AbstractMPP from ..utils import validate_directory +from . 
import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, TimelineHistory + if TYPE_CHECKING: # pragma: no cover from ..config import Config diff --git a/patroni/dcs/zookeeper.py b/patroni/dcs/zookeeper.py index 00c8e4bd7..b66869825 100644 --- a/patroni/dcs/zookeeper.py +++ b/patroni/dcs/zookeeper.py @@ -4,18 +4,20 @@ import socket import time -from kazoo.client import KazooClient, KazooState, KazooRetry -from kazoo.exceptions import ConnectionClosedError, NoNodeError, NodeExistsError, SessionExpiredError +from typing import Any, Callable, Dict, List, Optional, Tuple, TYPE_CHECKING, Union + +from kazoo.client import KazooClient, KazooRetry, KazooState +from kazoo.exceptions import ConnectionClosedError, NodeExistsError, NoNodeError, SessionExpiredError from kazoo.handlers.threading import AsyncResult, SequentialThreadingHandler from kazoo.protocol.states import KeeperState, WatchedEvent, ZnodeStat from kazoo.retry import RetryFailedError from kazoo.security import ACL, make_acl -from typing import Any, Callable, Dict, List, Optional, Union, Tuple, TYPE_CHECKING -from . import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, Status, SyncState, TimelineHistory from ..exceptions import DCSError from ..postgresql.mpp import AbstractMPP from ..utils import deep_compare +from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, Status, SyncState, TimelineHistory + if TYPE_CHECKING: # pragma: no cover from ..config import Config diff --git a/patroni/dynamic_loader.py b/patroni/dynamic_loader.py index 6c207349e..48881fe76 100644 --- a/patroni/dynamic_loader.py +++ b/patroni/dynamic_loader.py @@ -5,9 +5,9 @@ import os import pkgutil import sys -from types import ModuleType -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TYPE_CHECKING, Type, TypeVar, Union +from types import ModuleType +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Type, TYPE_CHECKING, TypeVar, Union if TYPE_CHECKING: # pragma: no cover from .config import Config diff --git a/patroni/file_perm.py b/patroni/file_perm.py index ed9c4e671..62a13f80f 100644 --- a/patroni/file_perm.py +++ b/patroni/file_perm.py @@ -39,19 +39,27 @@ class __FilePermissions: def __init__(self) -> None: """Create a :class:`__FilePermissions` object and set default permissions.""" self.__set_owner_permissions() - self.__set_umask() + self.__orig_umask = self.__set_umask() - def __set_umask(self) -> None: + def __set_umask(self) -> int: """Set umask value based on calculations. .. note:: Should only be called once either :meth:`__set_owner_permissions` or :meth:`__set_group_permissions` has been executed. + + :returns: the previous value of the umask or ``0022`` if umask call failed. 
""" try: - os.umask(self.__pg_mode_mask) + return os.umask(self.__pg_mode_mask) except Exception as e: logger.error('Can not set umask to %03o: %r', self.__pg_mode_mask, e) + return 0o22 + + @property + def orig_umask(self) -> int: + """Original umask value.""" + return self.__orig_umask def __set_owner_permissions(self) -> None: """Make directories/files accessible only by the owner.""" diff --git a/patroni/global_config.py b/patroni/global_config.py index 5423d7e58..403b29af2 100644 --- a/patroni/global_config.py +++ b/patroni/global_config.py @@ -8,7 +8,7 @@ import types from copy import deepcopy -from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING +from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union from .collections import EMPTY_DICT from .utils import parse_bool, parse_int @@ -105,10 +105,16 @@ def is_paused(self) -> bool: """``True`` if cluster is in maintenance mode.""" return self.check_mode('pause') + @property + def is_quorum_commit_mode(self) -> bool: + """:returns: ``True`` if quorum commit replication is requested""" + return str(self.get('synchronous_mode')).lower() == 'quorum' + @property def is_synchronous_mode(self) -> bool: """``True`` if synchronous replication is requested and it is not a standby cluster config.""" - return self.check_mode('synchronous_mode') and not self.is_standby_cluster + return (self.check_mode('synchronous_mode') is True or self.is_quorum_commit_mode) \ + and not self.is_standby_cluster @property def is_synchronous_mode_strict(self) -> bool: diff --git a/patroni/ha.py b/patroni/ha.py index 01c629688..72d52b32b 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -8,18 +8,19 @@ from multiprocessing.pool import ThreadPool from threading import RLock -from typing import Any, Callable, Collection, Dict, List, NamedTuple, Optional, Union, Tuple, TYPE_CHECKING +from typing import Any, Callable, Collection, Dict, List, NamedTuple, Optional, Tuple, TYPE_CHECKING, Union from . import global_config, psycopg from .__main__ import Patroni from .async_executor import AsyncExecutor, CriticalTask from .collections import CaseInsensitiveSet -from .dcs import AbstractDCS, Cluster, Leader, Member, RemoteMember, Status, slot_name_from_member_name -from .exceptions import DCSError, PostgresConnectionException, PatroniFatalException +from .dcs import AbstractDCS, Cluster, Leader, Member, RemoteMember, slot_name_from_member_name, Status, SyncState +from .exceptions import DCSError, PatroniFatalException, PostgresConnectionException from .postgresql.callback_executor import CallbackAction from .postgresql.misc import postgres_version_to_int from .postgresql.postmaster import PostmasterProcess from .postgresql.rewind import Rewind +from .quorum import QuorumStateResolver from .tags import Tags from .utils import parse_int, polling_loop, tzutc @@ -231,6 +232,7 @@ def __init__(self, patroni: Patroni): self._leader_expiry_lock = RLock() self._failsafe = Failsafe(patroni.dcs) self._was_paused = False + self._promote_timestamp = 0 self._leader_timeline = None self.recovering = False self._async_response = CriticalTask() @@ -296,6 +298,8 @@ def set_is_leader(self, value: bool) -> None: """ with self._leader_expiry_lock: self._leader_expiry = time.time() + self.dcs.ttl if value else 0 + if not value: + self._promote_timestamp = 0 def sync_mode_is_active(self) -> bool: """Check whether synchronous replication is requested and already active. 
@@ -304,6 +308,13 @@ def sync_mode_is_active(self) -> bool:
         """
         return self.is_synchronous_mode() and not self.cluster.sync.is_empty
 
+    def quorum_commit_mode_is_active(self) -> bool:
+        """Checks whether quorum replication is requested and already active.
+
+        :returns: ``True`` if the primary already put its name into the ``/sync`` key in DCS.
+        """
+        return self.is_quorum_commit_mode() and not self.cluster.sync.is_empty
+
     def _get_failover_action_name(self) -> str:
         """Return the currently requested manual failover action name or the default ``failover``.
 
@@ -771,81 +782,221 @@ def is_synchronous_mode(self) -> bool:
         """:returns: `True` if synchronous replication is requested."""
         return global_config.is_synchronous_mode
 
+    def is_quorum_commit_mode(self) -> bool:
+        """``True`` if quorum commit replication is requested and "supported"."""
+        return global_config.is_quorum_commit_mode and self.state_handler.supports_multiple_sync
+
     def is_failsafe_mode(self) -> bool:
         """:returns: `True` if failsafe_mode is enabled in global configuration."""
         return global_config.check_mode('failsafe_mode')
 
-    def process_sync_replication(self) -> None:
-        """Process synchronous standby beahvior.
+    def _maybe_enable_synchronous_mode(self) -> Optional[SyncState]:
+        """Explicitly enable synchronous mode if not yet enabled.
+
+        We are trying to solve a corner case: synchronous mode needs to be explicitly enabled
+        by updating the ``/sync`` key with the current leader name and empty members. Otherwise
+        it will never be automatically enabled if there are no eligible candidates.
+
+        :returns: the latest version of the :class:`~patroni.dcs.SyncState` object.
+        """
+        sync = self.cluster.sync
+        if sync.is_empty:
+            sync = self.dcs.write_sync_state(self.state_handler.name, None, 0, version=sync.version)
+            if sync:
+                logger.info("Enabled synchronous replication")
+            else:
+                logger.warning("Updating sync state failed")
+        return sync
+
+    def disable_synchronous_replication(self) -> None:
+        """Cleans up the ``/sync`` key in DCS and updates ``synchronous_standby_names``.
+
+        .. note::
+            We fall back to using the value configured by the user for ``synchronous_standby_names``, if any.
+        """
+        # If synchronous_mode was turned off, we need to update synchronous_standby_names in Postgres
+        if not self.cluster.sync.is_empty and self.dcs.delete_sync_state(version=self.cluster.sync.version):
+            logger.info("Disabled synchronous replication")
+            self.state_handler.sync_handler.set_synchronous_standby_names(CaseInsensitiveSet())
+
+        # As synchronous_mode is off, check if the user configured Postgres synchronous replication instead
+        ssn = self.state_handler.config.synchronous_standby_names
+        self.state_handler.config.set_synchronous_standby_names(ssn)
+
+    def _process_quorum_replication(self) -> None:
+        """Process synchronous replication state when quorum commit is requested.
+
+        Synchronous standbys are registered in two places: ``postgresql.conf`` and DCS. The order of updating them
+        must keep the invariant that ``quorum + sync >= len(set(quorum pool) | set(sync pool))``. This is done using
+        :class:`QuorumStateResolver`, which, given the current state, the desired set of synchronous nodes, and the
+        replication level, outputs changes to DCS and synchronous replication in the correct order to reach the
+        desired state. If any of those steps causes an error we can just bail out and let the next iteration
+        rediscover the state and retry the necessary transitions.
+ """ + start_time = time.time() + + min_sync = global_config.min_synchronous_nodes + sync_wanted = global_config.synchronous_node_count + + sync = self._maybe_enable_synchronous_mode() + if not sync or not sync.leader: + return + + leader = sync.leader + + def _check_timeout(offset: float = 0) -> bool: + return time.time() - start_time + offset >= self.dcs.loop_wait + + while True: + transition = 'break' # we need define transition value if `QuorumStateResolver` produced no changes + sync_state = self.state_handler.sync_handler.current_state(self.cluster) + for transition, leader, num, nodes in QuorumStateResolver(leader=leader, + quorum=sync.quorum, + voters=sync.voters, + numsync=sync_state.numsync, + sync=sync_state.sync, + numsync_confirmed=sync_state.numsync_confirmed, + active=sync_state.active, + sync_wanted=sync_wanted, + leader_wanted=self.state_handler.name): + if _check_timeout(): + return + + if transition == 'quorum': + logger.info("Setting leader to %s, quorum to %d of %d (%s)", + leader, num, len(nodes), ", ".join(sorted(nodes))) + sync = self.dcs.write_sync_state(leader, nodes, num, version=sync.version) + if not sync: + return logger.info('Synchronous replication key updated by someone else.') + elif transition == 'sync': + logger.info("Setting synchronous replication to %d of %d (%s)", + num, len(nodes), ", ".join(sorted(nodes))) + # Bump up number of num nodes to meet minimum replication factor. Commits will have to wait until + # we have enough nodes to meet replication target. + if num < min_sync: + logger.warning("Replication factor %d requested, but %d synchronous standbys available." + " Commits will be delayed.", min_sync + 1, num) + num = min_sync + self.state_handler.sync_handler.set_synchronous_standby_names(nodes, num) + if transition != 'restart' or _check_timeout(1): + return + # synchronous_standby_names was transitioned from empty to non-empty and it may take + # some time for nodes to become synchronous. In this case we want to restart state machine + # hoping that we can update /sync key earlier than in loop_wait seconds. + time.sleep(1) + self.state_handler.reset_cluster_info_state(None) + + def _process_multisync_replication(self) -> None: + """Process synchronous replication state with one or more sync standbys. Synchronous standbys are registered in two places postgresql.conf and DCS. The order of updating them must be right. The invariant that should be kept is that if a node is primary and sync_standby is set in DCS, then that node must have synchronous_standby set to that value. Or more simple, first set in postgresql.conf and then in DCS. When removing, first remove in DCS, then in postgresql.conf. This is so we only consider promoting standbys that were guaranteed to be replicating synchronously. - - .. note:: - If ``synchronous_mode`` is disabled, we fall back to using the value configured by the user for - ``synchronous_standby_names``, if any. """ - if self.is_synchronous_mode(): - sync = self.cluster.sync - if sync.is_empty: - # corner case: we need to explicitly enable synchronous mode by updating the - # ``/sync`` key with the current leader name and empty members. In opposite case - # it will never be automatically enabled if there are not eligible candidates. 
-                sync = self.dcs.write_sync_state(self.state_handler.name, None, version=sync.version)
-                if not sync:
-                    return logger.warning("Updating sync state failed")
-                logger.info("Enabled synchronous replication")
+        sync = self._maybe_enable_synchronous_mode()
+        if not sync:
+            return
 
-            current = CaseInsensitiveSet(sync.members)
-            picked, allow_promote = self.state_handler.sync_handler.current_state(self.cluster)
-
-            if picked == current and current != allow_promote:
-                logger.warning('Inconsistent state between synchronous_standby_names = %s and /sync = %s key '
-                               'detected, updating synchronous replication key...', list(allow_promote), list(current))
-                sync = self.dcs.write_sync_state(self.state_handler.name, allow_promote, version=sync.version)
-                if not sync:
-                    return logger.warning("Updating sync state failed")
-                current = CaseInsensitiveSet(sync.members)
-
-            if picked != current:
-                # update synchronous standby list in dcs temporarily to point to common nodes in current and picked
-                sync_common = current & allow_promote
-                if sync_common != current:
-                    logger.info("Updating synchronous privilege temporarily from %s to %s",
-                                list(current), list(sync_common))
-                    sync = self.dcs.write_sync_state(self.state_handler.name, sync_common, version=sync.version)
-                    if not sync:
-                        return logger.info('Synchronous replication key updated by someone else.')
+        current_state = self.state_handler.sync_handler.current_state(self.cluster)
+        picked = current_state.active
+        allow_promote = current_state.sync
+        voters = CaseInsensitiveSet(sync.voters)
 
-                # When strict mode and no suitable replication connections put "*" to synchronous_standby_names
-                if global_config.is_synchronous_mode_strict and not picked:
-                    picked = CaseInsensitiveSet('*')
-                    logger.warning("No standbys available!")
-
-                # Update postgresql.conf and wait 2 secs for changes to become active
-                logger.info("Assigning synchronous standby status to %s", list(picked))
-                self.state_handler.sync_handler.set_synchronous_standby_names(picked)
-
-                if picked and picked != CaseInsensitiveSet('*') and allow_promote != picked:
-                    # Wait for PostgreSQL to enable synchronous mode and see if we can immediately set sync_standby
-                    time.sleep(2)
-                    _, allow_promote = self.state_handler.sync_handler.current_state(self.cluster)
-                    if allow_promote and allow_promote != sync_common:
-                        if not self.dcs.write_sync_state(self.state_handler.name, allow_promote, version=sync.version):
-                            return logger.info("Synchronous replication key updated by someone else")
-                        logger.info("Synchronous standby status assigned to %s", list(allow_promote))
+        if picked == voters and voters != allow_promote:
+            logger.warning('Inconsistent state between synchronous_standby_names = %s and /sync = %s key '
+                           'detected, updating synchronous replication key...', list(allow_promote), list(voters))
+            sync = self.dcs.write_sync_state(self.state_handler.name, allow_promote, 0, version=sync.version)
+            if not sync:
+                return logger.warning("Updating sync state failed")
+            voters = CaseInsensitiveSet(sync.voters)
+
+        if picked == voters:
+            return
+
+        # update synchronous standby list in dcs temporarily to point to common nodes in current and picked
+        sync_common = voters & allow_promote
+        if sync_common != voters:
+            logger.info("Updating synchronous privilege temporarily from %s to %s",
+                        list(voters), list(sync_common))
+            sync = self.dcs.write_sync_state(self.state_handler.name, sync_common, 0, version=sync.version)
+            if not sync:
+                return logger.info('Synchronous replication key updated by someone else.')
+
+        # When in strict mode and there are no suitable replication connections, put "*" into
+        # synchronous_standby_names
+        if global_config.is_synchronous_mode_strict and not picked:
+            picked = CaseInsensitiveSet('*')
+            logger.warning("No standbys available!")
+
+        # Update postgresql.conf and wait 2 secs for changes to become active
+        logger.info("Assigning synchronous standby status to %s", list(picked))
+        self.state_handler.sync_handler.set_synchronous_standby_names(picked)
+
+        if picked and picked != CaseInsensitiveSet('*') and allow_promote != picked:
+            # Wait for PostgreSQL to enable synchronous mode and see if we can immediately set sync_standby
+            time.sleep(2)
+            allow_promote = self.state_handler.sync_handler.current_state(self.cluster).sync
+
+        if allow_promote and allow_promote != sync_common:
+            if self.dcs.write_sync_state(self.state_handler.name, allow_promote, 0, version=sync.version):
+                logger.info("Synchronous standby status assigned to %s", list(allow_promote))
+            else:
+                logger.info("Synchronous replication key updated by someone else")
+
+    def process_sync_replication(self) -> None:
+        """Process synchronous replication behavior on the primary."""
+        if self.is_quorum_commit_mode():
+            # synchronous_standby_names was adjusted right before promotion.
+            # After that, when postgres has become a primary, we need to reflect this change
+            # in the /sync key. Further changes of synchronous_standby_names and the /sync key
+            # should be postponed for `loop_wait` seconds, to give some replicas a chance to
+            # start streaming. Otherwise the /sync key will end up without synchronous nodes.
+            if self.state_handler.is_primary():
+                if self._promote_timestamp == 0 or time.time() - self._promote_timestamp > self.dcs.loop_wait:
+                    self._process_quorum_replication()
+                if self._promote_timestamp == 0:
+                    self._promote_timestamp = time.time()
+        elif self.is_synchronous_mode():
+            self._process_multisync_replication()
         else:
-            # If synchronous_mode was turned off, we need to update synchronous_standby_names in Postgres
-            if not self.cluster.sync.is_empty and self.dcs.delete_sync_state(version=self.cluster.sync.version):
-                logger.info("Disabled synchronous replication")
-                self.state_handler.sync_handler.set_synchronous_standby_names(CaseInsensitiveSet())
+            self.disable_synchronous_replication()
+
+    def process_sync_replication_prepromote(self) -> bool:
+        """Handle sync replication state before promotion.
+
+        If quorum replication is requested and we can keep syncing to enough nodes to satisfy the quorum
+        invariant, we can promote immediately and let the normal quorum resolver process handle any
+        membership changes later. Otherwise, we will just reset the DCS state to ourselves and add
+        replicas as they connect.
+
+        :returns: ``True`` on success or ``False`` if it failed to update the /sync key in DCS.
+ """ + if not self.is_synchronous_mode(): + self.disable_synchronous_replication() + return True + + if self.quorum_commit_mode_is_active(): + sync = CaseInsensitiveSet(self.cluster.sync.members) + numsync = len(sync) - self.cluster.sync.quorum - 1 + if self.state_handler.name not in sync: # Node outside voters achieved quorum and got leader + numsync += 1 + else: + sync.discard(self.state_handler.name) + else: + sync = CaseInsensitiveSet() + numsync = global_config.min_synchronous_nodes - # As synchronous_mode is off, check if the user configured Postgres synchronous replication instead - ssn = self.state_handler.config.synchronous_standby_names - self.state_handler.config.set_synchronous_standby_names(ssn) + if not self.is_quorum_commit_mode() or not self.state_handler.supports_multiple_sync and numsync > 1: + sync = CaseInsensitiveSet() + numsync = global_config.min_synchronous_nodes + + # Just set ourselves as the authoritative source of truth for now. We don't want to wait for standbys + # to connect. We will try finding a synchronous standby in the next cycle. + if not self.dcs.write_sync_state(self.state_handler.name, None, 0, version=self.cluster.sync.version): + return False + + self.state_handler.sync_handler.set_synchronous_standby_names(sync, numsync) + return True def is_sync_standby(self, cluster: Cluster) -> bool: """:returns: `True` if the current node is a synchronous standby.""" @@ -956,15 +1107,10 @@ def enforce_primary_role(self, message: str, promote_message: str) -> str: self.process_sync_replication() return message else: - if self.is_synchronous_mode(): - # Just set ourselves as the authoritative source of truth for now. We don't want to wait for standbys - # to connect. We will try finding a synchronous standby in the next cycle. - if not self.dcs.write_sync_state(self.state_handler.name, None, version=self.cluster.sync.version): - # Somebody else updated sync state, it may be due to us losing the lock. To be safe, postpone - # promotion until next cycle. TODO: trigger immediate retry of run_cycle - return 'Postponing promotion because synchronous replication state was updated by somebody else' - self.state_handler.sync_handler.set_synchronous_standby_names( - CaseInsensitiveSet('*') if global_config.is_synchronous_mode_strict else CaseInsensitiveSet()) + if not self.process_sync_replication_prepromote(): + # Somebody else updated sync state, it may be due to us losing the lock. To be safe, + # postpone promotion until next cycle. TODO: trigger immediate retry of run_cycle. + return 'Postponing promotion because synchronous replication state was updated by somebody else' if self.state_handler.role not in ('master', 'promoted', 'primary'): # reset failsafe state when promote self._failsafe.set_is_active(0) @@ -980,10 +1126,14 @@ def before_promote(): return promote_message def fetch_node_status(self, member: Member) -> _MemberStatus: - """This function perform http get request on member.api_url and fetches its status - :returns: `_MemberStatus` object - """ + """Perform http get request on member.api_url to fetch its status. + + Usually this happens during the leader race and we can't afford to wait an indefinite time + for a response, therefore the request timeout is hardcoded to 2 seconds, which seems to be a + good compromise. The node which is slow to respond is most likely unhealthy. 
+        :returns: :class:`_MemberStatus` object
+        """
         try:
             response = self.patroni.request(member, timeout=2, retries=0)
             data = response.data.decode('utf-8')
@@ -1092,18 +1242,26 @@ def check_failsafe_topology(self) -> bool:
         return ret
 
     def is_lagging(self, wal_position: int) -> bool:
-        """Returns if instance with an wal should consider itself unhealthy to be promoted due to replication lag.
+        """Check if the node should consider itself unhealthy to be promoted due to replication lag.
 
         :param wal_position: Current wal position.
 
-        :returns True when node is lagging
+        :returns: ``True`` when the node is lagging
         """
         lag = self.cluster.status.last_lsn - wal_position
        return lag > global_config.maximum_lag_on_failover
 
     def _is_healthiest_node(self, members: Collection[Member], check_replication_lag: bool = True) -> bool:
-        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""
-
+        """Determine whether the current node is healthy enough to become a new leader candidate.
+
+        :param members: the list of nodes to check against.
+        :param check_replication_lag: whether to take the replication lag into account.
+            If the lag exceeds the configured threshold the node disqualifies itself.
+        :returns: ``True`` if the node is eligible to become the new leader. Since this method is executed
+            on multiple nodes independently, it is possible that multiple nodes count themselves as the
+            healthiest because they received/replayed up to the same LSN, but this is perfectly fine.
+        """
         my_wal_position = self.state_handler.last_operation()
         if check_replication_lag and self.is_lagging(my_wal_position):
             logger.info('My wal position exceeds maximum replication lag')
@@ -1119,8 +1277,26 @@ def _is_healthiest_node(self, members: Collection[Member], check_replication_lag
             logger.info('My timeline %s is behind last known cluster timeline %s', my_timeline, cluster_timeline)
             return False
 
-        # Prepare list of nodes to run check against
-        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
+        if self.quorum_commit_mode_is_active():
+            quorum = self.cluster.sync.quorum
+            voting_set = CaseInsensitiveSet(self.cluster.sync.members)
+        else:
+            quorum = 0
+            voting_set = CaseInsensitiveSet()
+
+        # Prepare the list of nodes to run the check against. If quorum commit is enabled
+        # we also include members with the nofailover tag if they are listed in the voters.
+        members = [m for m in members if m.name != self.state_handler.name
+                   and m.api_url and (not m.nofailover or m.name in voting_set)]
+
+        # If a quorum is active then at least one member of the quorum set has the latest commit. A quorum
+        # member reporting that its WAL position is not ahead counts as a vote that we may become the new
+        # leader. Note that a node doesn't have to be a member of the voting set to gather the necessary votes.
+
+        # Regardless of voting, if we observe a node that can become a leader and is ahead, we defer to that
+        # node. This can lead to failure to act on quorum if there is asymmetric connectivity.
+ quorum_votes = 0 if self.state_handler.name in voting_set else -1 + nodes_ahead = 0 for st in self.fetch_nodes_statuses(members): if st.failover_limitation() is None: @@ -1128,22 +1304,34 @@ def _is_healthiest_node(self, members: Collection[Member], check_replication_lag logger.warning('Primary (%s) is still alive', st.member.name) return False if my_wal_position < st.wal_position: + nodes_ahead += 1 logger.info('Wal position of %s is ahead of my wal position', st.member.name) # In synchronous mode the former leader might be still accessible and even be ahead of us. # We should not disqualify himself from the leader race in such a situation. if not self.sync_mode_is_active() or not self.cluster.sync.leader_matches(st.member.name): return False logger.info('Ignoring the former leader being ahead of us') - if my_wal_position == st.wal_position and self.patroni.failover_priority < st.failover_priority: - # There's a higher priority non-lagging replica - logger.info( - '%s has equally tolerable WAL position and priority %s, while this node has priority %s', - st.member.name, - st.failover_priority, - self.patroni.failover_priority, - ) - return False - return True + elif st.wal_position > 0: # we want to count votes only from nodes with postgres up and running! + quorum_vote = st.member.name in voting_set + low_priority = my_wal_position == st.wal_position \ + and self.patroni.failover_priority < st.failover_priority + + if low_priority and (not self.sync_mode_is_active() or quorum_vote): + # There's a higher priority non-lagging replica + logger.info( + '%s has equally tolerable WAL position and priority %s, while this node has priority %s', + st.member.name, st.failover_priority, self.patroni.failover_priority) + return False + + if quorum_vote: + logger.info('Got quorum vote from %s', st.member.name) + quorum_votes += 1 + + # When not in quorum commit we just want to return `True`. + # In quorum commit the former leader is special and counted healthy even when there are no other nodes. + # Otherwise check that the number of votes exceeds the quorum field from the /sync key. + return not self.quorum_commit_mode_is_active() or quorum_votes >= quorum\ + or nodes_ahead == 0 and self.cluster.sync.leader == self.state_handler.name def is_failover_possible(self, *, cluster_lsn: int = 0, exclude_failover_candidate: bool = False) -> bool: """Checks whether any of the cluster members is allowed to promote and is healthy enough for that. @@ -1203,9 +1391,10 @@ def manual_failover_process_no_leader(self) -> Optional[bool]: return None return False - # in synchronous mode when our name is not in the /sync key - # we shouldn't take any action even if the candidate is unhealthy - if self.is_synchronous_mode() and not self.cluster.sync.matches(self.state_handler.name, True): + # in synchronous mode (except quorum commit!) when our name is not in the + # /sync key we shouldn't take any action even if the candidate is unhealthy + if self.is_synchronous_mode() and not self.is_quorum_commit_mode()\ + and not self.cluster.sync.matches(self.state_handler.name, True): return False # find specific node and check that it is healthy @@ -1303,9 +1492,11 @@ def is_healthiest_node(self) -> bool: all_known_members += [RemoteMember(name, {'api_url': url}) for name, url in failsafe_members.items()] all_known_members += self.cluster.members - # When in sync mode, only last known primary and sync standby are allowed to promote automatically. 
+ # Special handling if synchronous mode was requested and activated (the leader in /sync is not empty) if self.sync_mode_is_active(): - if not self.cluster.sync.matches(self.state_handler.name, True): + # In quorum commit mode we allow nodes outside of "voters" to take part in + # the leader race. They just need to get enough votes to `reach quorum + 1`. + if not self.is_quorum_commit_mode() and not self.cluster.sync.matches(self.state_handler.name, True): return False # pick between synchronous candidates so we minimize unnecessary failovers/demotions members = {m.name: m for m in all_known_members if self.cluster.sync.matches(m.name, True)} @@ -2205,8 +2396,11 @@ def get_failover_candidates(self, exclude_failover_candidate: bool) -> List[Memb exclude = [self.state_handler.name] + ([failover.candidate] if failover and exclude_failover_candidate else []) def is_eligible(node: Member) -> bool: + # If quorum commit is requested we want to check all nodes (even not voters), + # because they could get enough votes and reach necessary quorum + 1. # in synchronous mode we allow failover (not switchover!) to async node - if self.sync_mode_is_active() and not self.cluster.sync.matches(node.name)\ + if self.sync_mode_is_active()\ + and not (self.is_quorum_commit_mode() or self.cluster.sync.matches(node.name))\ and not (failover and not failover.leader): return False # Don't spend time on "nofailover" nodes checking. diff --git a/patroni/log.py b/patroni/log.py index 2f383b0c3..618ebf49b 100644 --- a/patroni/log.py +++ b/patroni/log.py @@ -8,19 +8,52 @@ import sys from copy import deepcopy +from io import TextIOWrapper from logging.handlers import RotatingFileHandler -from queue import Queue, Full +from queue import Full, Queue from threading import Lock, Thread +from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union -from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING - -from .utils import deep_compare +from .file_perm import pg_perm +from .utils import deep_compare, parse_int type_logformat = Union[List[Union[str, Dict[str, Any], Any]], str, Any] _LOGGER = logging.getLogger(__name__) +class PatroniFileHandler(RotatingFileHandler): + """Wrapper of :class:`RotatingFileHandler` to handle permissions of log files. """ + + def __init__(self, filename: str, mode: Optional[int]) -> None: + """Create a new :class:`PatroniFileHandler` instance. + + :param filename: basename for log files. + :param mode: permissions for log files. + """ + self.set_log_file_mode(mode) + super(PatroniFileHandler, self).__init__(filename) + + def set_log_file_mode(self, mode: Optional[int]) -> None: + """Set mode for Patroni log files. + + :param mode: permissions for log files. + + .. note:: + If *mode* is not specified, we calculate it from the `umask` value. + """ + self._log_file_mode = 0o666 & ~pg_perm.orig_umask if mode is None else mode + + def _open(self) -> TextIOWrapper: + """Open a new log file and assign permissions. + + :returns: the resulting stream. + """ + ret = super(PatroniFileHandler, self)._open() + os.chmod(self.baseFilename, self._log_file_mode) + return ret + + def debug_exception(self: logging.Logger, msg: object, *args: Any, **kwargs: Any) -> None: """Add full stack trace info to debug log messages and partial to others. 
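For the ``PatroniFileHandler`` introduced above, the effective default mode is derived from the umask captured at startup. A small sketch of the calculation (``default_log_file_mode`` is a hypothetical helper for illustration, not part of the patch): an explicit ``log.mode`` wins, otherwise permissions start from ``0o666`` with the bits masked by the original umask cleared.

.. code-block:: python

    def default_log_file_mode(mode, orig_umask):
        # Mirrors PatroniFileHandler.set_log_file_mode: an explicit mode wins,
        # otherwise derive the mode from the umask captured at startup.
        return 0o666 & ~orig_umask if mode is None else mode

    print(oct(default_log_file_mode(None, 0o022)))   # '0o644' under the common umask 022
    print(oct(default_log_file_mode(None, 0o077)))   # '0o600' under a restrictive umask 077
    print(oct(default_log_file_mode(0o600, 0o022)))  # '0o600' when log.mode is set explicitly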
@@ -423,15 +456,16 @@ def reload_config(self, config: Dict[str, Any]) -> None: handler = self.log_handler if 'dir' in config: - if not isinstance(handler, RotatingFileHandler): - handler = RotatingFileHandler(os.path.join(config['dir'], __name__)) - + mode = parse_int(config.get('mode')) + if not isinstance(handler, PatroniFileHandler): + handler = PatroniFileHandler(os.path.join(config['dir'], __name__), mode) + handler.set_log_file_mode(mode) max_file_size = int(config.get('file_size', 25000000)) handler.maxBytes = max_file_size # pyright: ignore [reportAttributeAccessIssue] handler.backupCount = int(config.get('file_num', 4)) # we can't use `if not isinstance(handler, logging.StreamHandler)` below, - # because RotatingFileHandler is a child of StreamHandler!!! - elif handler is None or isinstance(handler, RotatingFileHandler): + # because RotatingFileHandler and PatroniFileHandler are children of StreamHandler!!! + elif handler is None or isinstance(handler, PatroniFileHandler): handler = logging.StreamHandler() is_new_handler = handler != self.log_handler @@ -454,7 +488,7 @@ def _close_old_handlers(self) -> None: .. note:: It is used to remove different handlers that were configured previous to a reload in the configuration, - e.g. if we are switching from :class:`~logging.handlers.RotatingFileHandler` to + e.g. if we are switching from :class:`PatroniFileHandler` to class:`~logging.StreamHandler` and vice-versa. """ while True: diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 0ed5bd3b0..2876951cb 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -9,11 +9,19 @@ from contextlib import contextmanager from copy import deepcopy from datetime import datetime +from threading import current_thread, Lock +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING, Union + from dateutil import tz from psutil import TimeoutExpired -from threading import current_thread, Lock -from typing import Any, Callable, Dict, Iterator, List, Optional, Union, Tuple, TYPE_CHECKING +from .. import global_config, psycopg +from ..async_executor import CriticalTask +from ..collections import CaseInsensitiveDict, CaseInsensitiveSet, EMPTY_DICT +from ..dcs import Cluster, Leader, Member +from ..exceptions import PostgresConnectionException +from ..tags import Tags +from ..utils import data_directory_is_empty, parse_int, polling_loop, Retry, RetryFailedError from .bootstrap import Bootstrap from .callback_executor import CallbackAction, CallbackExecutor from .cancellable import CancellableSubprocess @@ -24,13 +32,6 @@ from .postmaster import PostmasterProcess from .slots import SlotsHandler from .sync import SyncHandler -from .. 
import global_config, psycopg
-from ..async_executor import CriticalTask
-from ..collections import CaseInsensitiveSet, CaseInsensitiveDict, EMPTY_DICT
-from ..dcs import Cluster, Leader, Member
-from ..exceptions import PostgresConnectionException
-from ..utils import Retry, RetryFailedError, polling_loop, data_directory_is_empty, parse_int
-from ..tags import Tags
 
 if TYPE_CHECKING:  # pragma: no cover
     from psycopg import Connection as Connection3, Cursor
@@ -181,6 +182,11 @@ def wal_flush(self) -> str:
     def lsn_name(self) -> str:
         return 'lsn' if self._major_version >= 100000 else 'location'
 
+    @property
+    def supports_quorum_commit(self) -> bool:
+        """``True`` if quorum commit is supported by Postgres."""
+        return self._major_version >= 100000
+
     @property
     def supports_multiple_sync(self) -> bool:
         """:returns: `True` if Postgres version supports more than one synchronous node."""
diff --git a/patroni/postgresql/bootstrap.py b/patroni/postgresql/bootstrap.py
index 47dbacfe7..e355dec20 100644
--- a/patroni/postgresql/bootstrap.py
+++ b/patroni/postgresql/bootstrap.py
@@ -4,7 +4,7 @@
 import tempfile
 import time
 
-from typing import Any, Callable, Dict, List, Optional, Union, Tuple, TYPE_CHECKING
+from typing import Any, Callable, Dict, List, Optional, Tuple, TYPE_CHECKING, Union
 
 from ..async_executor import CriticalTask
 from ..collections import EMPTY_DICT
@@ -444,11 +444,8 @@ def post_bootstrap(self, config: Dict[str, Any], task: CriticalTask) -> Optional
             postgresql.query(sql)
 
         if config.get('users'):
-            logger.warning('User creation via "bootstrap.users" will be removed in v4.0.0')
-
-            for name, value in (config.get('users') or EMPTY_DICT).items():
-                if all(name != a.get('username') for a in (superuser, replication, rewind)):
-                    self.create_or_update_role(name, value.get('password'), value.get('options', []))
+            logger.error('User creation is no longer supported starting from v4.0.0. '
+                         'Please use the "bootstrap.post_bootstrap" script to create users.')
 
         # We were doing a custom bootstrap instead of running initdb, therefore we opened trust
         # access from certain addresses to be able to reach cluster and change password
diff --git a/patroni/postgresql/cancellable.py b/patroni/postgresql/cancellable.py
index e44729b07..5e75fd203 100644
--- a/patroni/postgresql/cancellable.py
+++ b/patroni/postgresql/cancellable.py
@@ -1,12 +1,14 @@
 import logging
-import psutil
 import subprocess
 
-from patroni.exceptions import PostgresException
-from patroni.utils import polling_loop
 from threading import Lock
 from typing import Any, Dict, List, Optional, Union
 
+import psutil
+
+from patroni.exceptions import PostgresException
+from patroni.utils import polling_loop
+
 logger = logging.getLogger(__name__)
 
diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py
index f7678045d..f857bbdfc 100644
--- a/patroni/postgresql/config.py
+++ b/patroni/postgresql/config.py
@@ -7,19 +7,19 @@
 import time
 
 from contextlib import contextmanager
-from urllib.parse import urlparse, parse_qsl, unquote
 from types import TracebackType
-from typing import Any, Callable, Collection, Dict, Iterator, List, Optional, Union, Tuple, Type, TYPE_CHECKING
+from typing import Any, Callable, Collection, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING, Union
+from urllib.parse import parse_qsl, unquote, urlparse
 
-from .validator import recovery_parameters, transform_postgresql_parameter_value, transform_recovery_parameter_value
 from ..
import global_config from ..collections import CaseInsensitiveDict, CaseInsensitiveSet, EMPTY_DICT from ..dcs import Leader, Member, RemoteMember, slot_name_from_member_name from ..exceptions import PatroniFatalException, PostgresConnectionException from ..file_perm import pg_perm -from ..utils import (compare_values, maybe_convert_from_base_unit, parse_bool, parse_int, - split_host_port, uri, validate_directory, is_subpath) -from ..validator import IntValidator, EnumValidator +from ..utils import compare_values, is_subpath, maybe_convert_from_base_unit, \ + parse_bool, parse_int, split_host_port, uri, validate_directory +from ..validator import EnumValidator, IntValidator +from .validator import recovery_parameters, transform_postgresql_parameter_value, transform_recovery_parameter_value if TYPE_CHECKING: # pragma: no cover from . import Postgresql diff --git a/patroni/postgresql/connection.py b/patroni/postgresql/connection.py index 040dcf78d..b48d1da13 100644 --- a/patroni/postgresql/connection.py +++ b/patroni/postgresql/connection.py @@ -2,7 +2,8 @@ from contextlib import contextmanager from threading import Lock -from typing import Any, Dict, Iterator, List, Optional, Union, Tuple, TYPE_CHECKING +from typing import Any, Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING, Union + if TYPE_CHECKING: # pragma: no cover from psycopg import Connection, Cursor from psycopg2 import connection, cursor diff --git a/patroni/postgresql/mpp/__init__.py b/patroni/postgresql/mpp/__init__.py index 5120db35a..b29d9f01e 100644 --- a/patroni/postgresql/mpp/__init__.py +++ b/patroni/postgresql/mpp/__init__.py @@ -5,15 +5,15 @@ """ import abc -from typing import Any, Dict, Iterator, Optional, Union, Tuple, Type, TYPE_CHECKING +from typing import Any, Dict, Iterator, Optional, Tuple, Type, TYPE_CHECKING, Union from ...dcs import Cluster from ...dynamic_loader import iter_classes from ...exceptions import PatroniException if TYPE_CHECKING: # pragma: no cover - from .. import Postgresql from ...config import Config + from .. import Postgresql class AbstractMPP(abc.ABC): diff --git a/patroni/postgresql/mpp/citus.py b/patroni/postgresql/mpp/citus.py index e287ea67b..d4b26efeb 100644 --- a/patroni/postgresql/mpp/citus.py +++ b/patroni/postgresql/mpp/citus.py @@ -3,13 +3,13 @@ import time from threading import Condition, Event, Thread +from typing import Any, Collection, Dict, Iterator, List, Optional, Set, Tuple, TYPE_CHECKING, Union from urllib.parse import urlparse -from typing import Any, Dict, List, Optional, Union, Tuple, TYPE_CHECKING -from . import AbstractMPP, AbstractMPPHandler from ...dcs import Cluster -from ...psycopg import connect, quote_ident, ProgrammingError +from ...psycopg import connect, ProgrammingError, quote_ident from ...utils import parse_int +from . import AbstractMPP, AbstractMPPHandler if TYPE_CHECKING: # pragma: no cover from .. import Postgresql @@ -19,19 +19,312 @@ logger = logging.getLogger(__name__) -class PgDistNode(object): - """Represents a single row in the `pg_dist_node` table""" +class PgDistNode: + """Represents a single row in "pg_dist_node" table. - def __init__(self, group: int, host: str, port: int, event: str, nodeid: Optional[int] = None, - timeout: Optional[float] = None, cooldown: Optional[float] = None) -> None: - self.group = group - # A weird way of pausing client connections by adding the `-demoted` suffix to the hostname - self.host = host + ('-demoted' if event == 'before_demote' else '') + .. 
note:: + + Unlike "noderole" possible values of ``role`` are ``primary``, ``secondary``, and ``demoted``. + The last one is used to pause client connections on the coordinator to the worker by + appending ``-demoted`` suffix to the "nodename". The actual "noderole" in DB remains ``primary``. + + :ivar host: "nodename" value + :ivar port: "nodeport" value + :ivar role: "noderole" value + :ivar nodeid: "nodeid" value + """ + + def __init__(self, host: str, port: int, role: str, nodeid: Optional[int] = None) -> None: + """Create a :class:`PgDistNode` object based on given arguments. + + :param host: "nodename" of the Citus coordinator or worker. + :param port: "nodeport" of the Citus coordinator or worker. + :param role: "noderole" value. + :param nodeid: id of the row in the "pg_dist_node". + """ + self.host = host self.port = port + self.role = role + self.nodeid = nodeid + + def __hash__(self) -> int: + """Defines a hash function to put :class:`PgDistNode` objects to :class:`PgDistGroup` set-like object. + + .. note:: + We use (:attr:`host`, :attr:`port`) tuple here because it is one of the UNIQUE constraints on the + "pg_dist_node" table. The :attr:`role` value is irrelevant here because nodes may change their roles. + """ + return hash((self.host, self.port)) + + def __eq__(self, other: Any) -> bool: + """Defines a comparison function. + + :returns: ``True`` if :attr:`host` and :attr:`port` between two instances are the same. + """ + return isinstance(other, PgDistNode) and self.host == other.host and self.port == other.port + + def __str__(self) -> str: + return ('PgDistNode(nodeid={0},host={1},port={2},role={3})' + .format(self.nodeid, self.host, self.port, self.role)) + + def __repr__(self) -> str: + return str(self) + + def is_primary(self) -> bool: + """Checks whether this object represents "primary" in a corresponding group. + + :returns: ``True`` if this object represents the ``primary``. + """ + return self.role in ('primary', 'demoted') + + def as_tuple(self, include_nodeid: bool = False) -> Tuple[str, int, str, Optional[int]]: + """Helper method to compare two :class:`PgDistGroup` objects. + + .. note:: + + *include_nodeid* is set to ``True`` only in unit-tests. + + :param include_nodeid: whether :attr:`nodeid` should be taken into account when comparison is performed. + + :returns: :class:`tuple` object with :attr:`host`, :attr:`port`, :attr:`role`, and optionally :attr:`nodeid`. + """ + return self.host, self.port, self.role, (self.nodeid if include_nodeid else None) + + +class PgDistGroup(Set[PgDistNode]): + """A :class:`set`-like object that represents a Citus group in "pg_dist_node" table. + + This class implements a set of methods to compare topology and if it is necessary + to transition from the old to the new topology in a "safe" manner: + + * register new primary/secondaries + * replace gone secondaries with added secondaries + * failover and switchover + + Typically there will be at least one :class:`PgDistNode` object registered (``primary``). + In addition to that there could be one or more ``secondary`` nodes. + + :ivar failover: whether the ``primary`` row should be updated as a result of :func:`transition` method call. + :ivar groupid: the "groupid" from "pg_dist_node". + """ + + def __init__(self, groupid: int, nodes: Optional[Collection[PgDistNode]] = None) -> None: + """Creates a :class:`PgDistGroup` object based on given arguments. + + :param groupid: the groupid from "pg_dist_node". 
+ :param nodes: a collection of :class:`PgDistNode` objects that belog to a *groupid*. + """ + self.failover = False + self.groupid = groupid + + if nodes: + self.update(nodes) + + def equals(self, other: 'PgDistGroup', check_nodeid: bool = False) -> bool: + """Compares two :class:`PgDistGroup` objects. + + :param other: what we want to compare with. + :param check_nodeid: whether :attr:`PgDistNode.nodeid` should be compared in addition to + :attr:`PgDistNode.host`, :attr:`PgDistNode.port`, and :attr:`PgDistNode.role`. + + :returns: ``True`` if two :class:`PgDistGroup` objects are fully identical. + """ + return self.groupid == other.groupid\ + and set(v.as_tuple(check_nodeid) for v in self) == set(v.as_tuple(check_nodeid) for v in other) + + def primary(self) -> Optional[PgDistNode]: + """Finds and returns :class:`PgDistNode` object that represents the "primary". + + :returns: :class:`PgDistNode` object which represents the "primary" or ``None`` if not found. + """ + return next(iter(v for v in self if v.is_primary()), None) + + def get(self, value: PgDistNode) -> Optional[PgDistNode]: + """Performs a lookup of the actual value in a set. + + .. note:: + It is necessary because :func:`__hash__` and :func:`__eq__` methods in :class:`PgDistNode` are + redefined and effectively they check only :attr:`PgDistNode.host` and :attr:`PgDistNode.port` attributes. + + :param value: the key we search for. + :returns: the actual :class:`PgDistNode` value from this :class:`PgDistGroup` object or ``None`` if not found. + """ + return next(iter(v for v in self if v == value), None) + + def transition(self, old: 'PgDistGroup') -> Iterator[PgDistNode]: + """Compares this topology with the old one and yields transitions that transform the old to the new one. + + .. note:: + The actual yielded object is :class:`PgDistNode` that will be passed to + the :meth:`CitusHandler.update_node` to execute all transitions in a transaction. + + In addition to the yielding transactions this method fills up :attr:`PgDistNode.nodeid` + attribute for nodes that are presented in the old and in the new topology. + + There are a few simple rules/constraints that are imposed by Citus and must be followed: + - adding/removing nodes is only possible when metadata is synced to all registered "priorities". + + - the "primary" row in "pg_dist_node" always keeps the nodeid (unless it is + removed, but it is not supported by Patroni). + + - "nodename", "nodeport" must be unique across all rows in the "pg_dist_node". This means that + every time we want to change the nodeid of an existing node (i.e. to change it from secondary + to primary), we should first write some other "nodename"/"nodeport" to the row it's currently in. + + - updating "broken" nodes always works and metadata is synced asynchnonously after the commit. + + Following these rules below is an example of the switchover between node1 (primary, nodeid=4) + and node2 (secondary, nodeid=5). + + .. code-block:: SQL + + BEGIN; + SELECT citus_update_node(4, 'node1-demoted', 5432); + SELECT citus_update_node(5, 'node1', 5432); + SELECT citus_update_node(4, 'node2', 5432); + COMMIT; + + :param old: the last known topology registered in "pg_dist_node" for a given :attr:`groupid`. + + :yields: :class:`PgDistNode` objects that must be updated/added/removed in "pg_dist_node". 
+ """ + self.failover = old.failover + + new_primary = self.primary() + assert new_primary is not None + old_primary = old.primary() + + gone_nodes = old - self - {old_primary} + added_nodes = self - old - {new_primary} + + if not old_primary: + # We did not have any nodes in the group yet and we're adding one now + yield new_primary + elif old_primary == new_primary: + new_primary.nodeid = old_primary.nodeid + # Controlled switchover with pausing client connections. + # Achieved by updating the primary row and putting hostname = '${host}-demoted' in a transaction. + if old_primary.role != new_primary.role: + self.failover = True + yield new_primary + elif old_primary != new_primary: + self.failover = True + + new_primary_old_node = old.get(new_primary) + old_primary_new_node = self.get(old_primary) + + # The new primary was registered as a secondary before failover + if new_primary_old_node: + new_node = None + # Old primary is gone and some new secondaries were added. + # We can use the row of promoted secondary to add the new secondary. + if not old_primary_new_node and added_nodes: + new_node = added_nodes.pop() + new_node.nodeid = new_primary_old_node.nodeid + yield new_node + + # notify _maybe_register_old_primary_as_secondary that the old primary should not be re-registered + old_primary.role = 'secondary' + # In opposite case we need to change the primary record to '${host}-demoted:${port}' + # before we can put its host:port to the row of promoted secondary. + elif old_primary.role == 'primary': + old_primary.role = 'demoted' + yield old_primary + + # The old primary is gone and the promoted secondary row wasn't yet used. + if not old_primary_new_node and not new_node: + # We have to "add" the gone primary to the row of promoted secondary because + # nodes could not be removed while the metadata isn't synced. + old_primary_new_node = PgDistNode(old_primary.host, old_primary.port, new_primary_old_node.role) + self.add(old_primary_new_node) + + # put the old primary instead of promoted secondary + if old_primary_new_node: + old_primary_new_node.nodeid = new_primary_old_node.nodeid + yield old_primary_new_node + + # update the primary record with the new information + new_primary.nodeid = old_primary.nodeid + yield new_primary + + # The new primary was never registered as a standby and there are secondaries that have gone away. Since + # nodes can't be removed while metadata isn't synced we have to temporarily "add" the old primary back. + if not new_primary_old_node and gone_nodes: + # We were in the middle of controlled switchover while the primary disappeared. + # If there are any gone nodes that can't be reused for new secondaries we will + # use one of them to temporarily "add" the old primary back as a secondary. + if not old_primary_new_node and old_primary.role == 'demoted' and len(gone_nodes) > len(added_nodes): + old_primary_new_node = PgDistNode(old_primary.host, old_primary.port, 'secondary') + self.add(old_primary_new_node) + + # Use one of the gone secondaries to put host:port of the old primary there. 
+                if old_primary_new_node:
+                    old_primary_new_node.nodeid = gone_nodes.pop().nodeid
+                    yield old_primary_new_node
+
+        # Fill nodeid for standbys in the new topology from the old ones
+        old_replicas = {v: v for v in old if not v.is_primary()}
+        for n in self:
+            if not n.is_primary() and not n.nodeid and n in old_replicas:
+                n.nodeid = old_replicas[n].nodeid
+
+        # Reuse nodeids of gone standbys to "add" new standbys
+        while gone_nodes and added_nodes:
+            a = added_nodes.pop()
+            a.nodeid = gone_nodes.pop().nodeid
+            yield a
+
+        # Operations that add or remove nodes are executed on primaries in all Citus groups using 2PC.
+        # If we know that the primary was updated (self.failover is True) that automatically means that
+        # adding/removing nodes calls will fail and the whole transaction will be aborted. Therefore
+        # we discard operations that add/remove secondaries if we know that the primary was just updated.
+        # The inconsistency will be automatically resolved on the next Patroni heartbeat loop.
+
+        # Remove remaining nodes that are gone, but only if the metadata is in sync (self.failover is False).
+        for g in gone_nodes:
+            if not self.failover:
+                # Remove the node if we expect metadata to be in sync
+                yield PgDistNode(g.host, g.port, '')
+            else:
+                # Otherwise add these nodes to the new topology
+                self.add(g)
+
+        # Add new nodes to the metadata, but only if the metadata is in sync (self.failover is False).
+        for a in added_nodes:
+            if not self.failover:
+                # Add the node if we expect metadata to be in sync
+                yield a
+            else:
+                # Otherwise remove them from the new topology
+                self.discard(a)
+
+
+class PgDistTask(PgDistGroup):
+    """A "task" that represents the current or desired state of "pg_dist_node" for a provided *groupid*.
+
+    :ivar groupid: the "groupid" in "pg_dist_node".
+    :ivar event: an "event" that resulted in creating this task.
+        Possible values: "before_demote", "before_promote", "after_promote".
+    :ivar timeout: a transaction timeout if the task resulted in starting a transaction.
+    :ivar cooldown: the cooldown value for ``citus_update_node()`` UDF call.
+    :ivar deadline: the time in Unix seconds when the transaction is allowed to be rolled back.
+    """
+
+    def __init__(self, groupid: int, nodes: Optional[Collection[PgDistNode]], event: str,
+                 timeout: Optional[float] = None, cooldown: Optional[float] = None) -> None:
+        """Create a :class:`PgDistTask` object based on given arguments.
+
+        :param groupid: the groupid from "pg_dist_node".
+        :param nodes: a collection of :class:`PgDistNode` objects that belong to a *groupid*.
+        :param event: an "event" that resulted in creating this task.
+        :param timeout: a transaction timeout if the task resulted in starting a transaction.
+        :param cooldown: the cooldown value for ``citus_update_node()`` UDF call.
+        """
+        super(PgDistTask, self).__init__(groupid, nodes)
+        # Event that is trying to change or changed the given row.
         # Possible values: before_demote, before_promote, after_promote.
self.event = event - self.nodeid = nodeid # If transaction was started, we need to COMMIT/ROLLBACK before the deadline self.timeout = timeout @@ -46,25 +339,20 @@ def __init__(self, group: int, host: str, port: int, event: str, nodeid: Optiona self._event = Event() def wait(self) -> None: + """Wait until this task is processed by a dedicated thread.""" self._event.wait() def wakeup(self) -> None: + """Notify a thread that created a task that it was processed.""" self._event.set() def __eq__(self, other: Any) -> bool: - return isinstance(other, PgDistNode) and self.event == other.event\ - and self.host == other.host and self.port == other.port + return isinstance(other, PgDistTask) and self.event == other.event\ + and super(PgDistTask, self).equals(other) def __ne__(self, other: Any) -> bool: return not self == other - def __str__(self) -> str: - return ('PgDistNode(nodeid={0},group={1},host={2},port={3},event={4})' - .format(self.nodeid, self.group, self.host, self.port, self.event)) - - def __repr__(self) -> str: - return str(self) - class Citus(AbstractMPP): @@ -109,11 +397,11 @@ def __init__(self, postgresql: 'Postgresql', config: Dict[str, Union[str, int]]) self._connection = postgresql.connection_pool.get( 'citus', {'dbname': config['database'], 'options': '-c statement_timeout=0 -c idle_in_transaction_session_timeout=0'}) - self._pg_dist_node: Dict[int, PgDistNode] = {} # Cache of pg_dist_node: {groupid: PgDistNode()} - self._tasks: List[PgDistNode] = [] # Requests to change pg_dist_node, every task is a `PgDistNode` - self._in_flight: Optional[PgDistNode] = None # Reference to the `PgDistNode` being changed in a transaction - self._schedule_load_pg_dist_node = True # Flag that "pg_dist_node" should be queried from the database - self._condition = Condition() # protects _pg_dist_node, _tasks, _in_flight, and _schedule_load_pg_dist_node + self._pg_dist_group: Dict[int, PgDistTask] = {} # Cache of pg_dist_node: {groupid: PgDistTask()} + self._tasks: List[PgDistTask] = [] # Requests to change pg_dist_group, every task is a `PgDistTask` + self._in_flight: Optional[PgDistTask] = None # Reference to the `PgDistTask` being changed in a transaction + self._schedule_load_pg_dist_group = True # Flag that "pg_dist_group" should be queried from the database + self._condition = Condition() # protects _pg_dist_group, _tasks, _in_flight, and _schedule_load_pg_dist_group self.schedule_cache_rebuild() def schedule_cache_rebuild(self) -> None: @@ -122,12 +410,12 @@ def schedule_cache_rebuild(self) -> None: Is called to notify handler that it has to refresh its metadata cache from the database. 
""" with self._condition: - self._schedule_load_pg_dist_node = True + self._schedule_load_pg_dist_group = True def on_demote(self) -> None: with self._condition: - self._pg_dist_node.clear() - empty_tasks: List[PgDistNode] = [] + self._pg_dist_group.clear() + empty_tasks: List[PgDistTask] = [] self._tasks[:] = empty_tasks self._in_flight = None @@ -143,22 +431,28 @@ def query(self, sql: str, *params: Any) -> List[Tuple[Any, ...]]: self.schedule_cache_rebuild() raise e - def load_pg_dist_node(self) -> bool: + def load_pg_dist_group(self) -> bool: """Read from the `pg_dist_node` table and put it into the local cache""" with self._condition: - if not self._schedule_load_pg_dist_node: + if not self._schedule_load_pg_dist_group: return True - self._schedule_load_pg_dist_node = False + self._schedule_load_pg_dist_group = False try: - rows = self.query("SELECT nodeid, groupid, nodename, nodeport, noderole" - " FROM pg_catalog.pg_dist_node WHERE noderole = 'primary'") + rows = self.query('SELECT groupid, nodename, nodeport, noderole, nodeid FROM pg_catalog.pg_dist_node') except Exception: return False + pg_dist_group: Dict[int, PgDistTask] = {} + + for row in rows: + if row[0] not in pg_dist_group: + pg_dist_group[row[0]] = PgDistTask(row[0], nodes=set(), event='after_promote') + pg_dist_group[row[0]].add(PgDistNode(*row[1:])) + with self._condition: - self._pg_dist_node = {r[1]: PgDistNode(r[1], r[2], r[3], 'after_promote', r[0]) for r in rows} + self._pg_dist_group = pg_dist_group return True def sync_meta_data(self, cluster: Cluster) -> None: @@ -166,7 +460,7 @@ def sync_meta_data(self, cluster: Cluster) -> None: We can't always rely on REST API calls from worker nodes in order to maintain `pg_dist_node`, therefore at least once per heartbeat - loop we make sure that workes registered in `self._pg_dist_node` + loop we make sure that workes registered in `self._pg_dist_group` cache are matching the cluster view from DCS by creating tasks the same way as it is done from the REST API.""" @@ -177,20 +471,21 @@ def sync_meta_data(self, cluster: Cluster) -> None: if not self.is_alive(): self.start() - self.add_task('after_promote', CITUS_COORDINATOR_GROUP_ID, self._postgresql.connection_string) + self.add_task('after_promote', CITUS_COORDINATOR_GROUP_ID, cluster, + self._postgresql.name, self._postgresql.connection_string) - for group, worker in cluster.workers.items(): + for groupid, worker in cluster.workers.items(): leader = worker.leader if leader and leader.conn_url\ and leader.data.get('role') in ('master', 'primary') and leader.data.get('state') == 'running': - self.add_task('after_promote', group, leader.conn_url) + self.add_task('after_promote', groupid, worker, leader.name, leader.conn_url) - def find_task_by_group(self, group: int) -> Optional[int]: + def find_task_by_groupid(self, groupid: int) -> Optional[int]: for i, task in enumerate(self._tasks): - if task.group == group: + if task.groupid == groupid: return i - def pick_task(self) -> Tuple[Optional[int], Optional[PgDistNode]]: + def pick_task(self) -> Tuple[Optional[int], Optional[PgDistTask]]: """Returns the tuple(i, task), where `i` - is the task index in the self._tasks list Tasks are picked by following priorities: @@ -198,44 +493,56 @@ def pick_task(self) -> Tuple[Optional[int], Optional[PgDistNode]]: 1. If there is already a transaction in progress, pick a task that that will change already affected worker primary. 2. If the coordinator address should be changed - pick a task - with group=0 (coordinators are always in group 0). 
+ with groupid=0 (coordinators are always in groupid 0). 3. Pick a task that is the oldest (first from the self._tasks) """ with self._condition: if self._in_flight: - i = self.find_task_by_group(self._in_flight.group) + i = self.find_task_by_groupid(self._in_flight.groupid) else: while True: - i = self.find_task_by_group(CITUS_COORDINATOR_GROUP_ID) # set_coordinator + i = self.find_task_by_groupid(CITUS_COORDINATOR_GROUP_ID) # set_coordinator if i is None and self._tasks: i = 0 if i is None: break task = self._tasks[i] - if task == self._pg_dist_node.get(task.group): - self._tasks.pop(i) # nothing to do because cached version of pg_dist_node already matches + if task == self._pg_dist_group.get(task.groupid): + self._tasks.pop(i) # nothing to do because cached version of pg_dist_group already matches else: break task = self._tasks[i] if i is not None else None - # When tasks are added it could happen that self._pg_dist_node - # wasn't ready (self._schedule_load_pg_dist_node is False) - # and hence the nodeid wasn't filled. - if task and task.group in self._pg_dist_node: - task.nodeid = self._pg_dist_node[task.group].nodeid return i, task - def update_node(self, task: PgDistNode) -> None: - if task.nodeid is not None: + def update_node(self, groupid: int, node: PgDistNode, cooldown: float = 10000) -> None: + if node.role not in ('primary', 'secondary', 'demoted'): + self.query('SELECT pg_catalog.citus_remove_node(%s, %s)', node.host, node.port) + elif node.nodeid is not None: + host = node.host + ('-demoted' if node.role == 'demoted' else '') self.query('SELECT pg_catalog.citus_update_node(%s, %s, %s, true, %s)', - task.nodeid, task.host, task.port, task.cooldown) - elif task.event != 'before_demote': - task.nodeid = self.query("SELECT pg_catalog.citus_add_node(%s, %s, %s, 'primary', 'default')", - task.host, task.port, task.group)[0][0] + node.nodeid, host, node.port, cooldown) + elif node.role != 'demoted': + node.nodeid = self.query("SELECT pg_catalog.citus_add_node(%s, %s, %s, %s, 'default')", + node.host, node.port, groupid, node.role)[0][0] + + def update_group(self, task: PgDistTask, transaction: bool) -> None: + current_state = self._in_flight\ + or self._pg_dist_group.get(task.groupid)\ + or PgDistTask(task.groupid, set(), 'after_promote') + transitions = list(task.transition(current_state)) + if transitions: + if not transaction and len(transitions) > 1: + self.query('BEGIN') + for node in transitions: + self.update_node(task.groupid, node, task.cooldown) + if not transaction and len(transitions) > 1: + task.failover = False + self.query('COMMIT') - def process_task(self, task: PgDistNode) -> bool: - """Updates a single row in `pg_dist_node` table, optionally in a transaction. + def process_task(self, task: PgDistTask) -> bool: + """Updates a single row in `pg_dist_group` table, optionally in a transaction. The transaction is started if we do a demote of the worker node or before promoting the other worker if there is no transaction in progress. And, the transaction is committed when the switchover/failover completed. @@ -246,34 +553,30 @@ def process_task(self, task: PgDistNode) -> bool: .. note: Read access to `self._in_flight` isn't protected because we know it can't be changed outside of our thread. - :param task: reference to a :class:`PgDistNode` object that represents a row to be updated/created. 
-        :returns: `True` if the row was succesfully created/updated or transaction in progress
-                  was committed as an indicator that the `self._pg_dist_node` cache should be updated,
+        :param task: reference to a :class:`PgDistTask` object that represents a row to be updated/created.
+        :returns: ``True`` if the row was successfully created/updated or the transaction in progress
+                  was committed as an indicator that the `self._pg_dist_group` cache should be updated,
                   or, if the new transaction was opened, this method returns `False`.
         """
         if task.event == 'after_promote':
-            # The after_promote may happen without previous before_demote and/or
-            # before_promore. In this case we just call self.update_node() method.
-            # If there is a transaction in progress, it could be that it already did
-            # required changes and we can simply COMMIT.
-            if not self._in_flight or self._in_flight.host != task.host or self._in_flight.port != task.port:
-                self.update_node(task)
+            self.update_group(task, self._in_flight is not None)
             if self._in_flight:
                 self.query('COMMIT')
+                task.failover = False
             return True
         else:  # before_demote, before_promote
             if task.timeout:
                 task.deadline = time.time() + task.timeout
             if not self._in_flight:
                 self.query('BEGIN')
-            self.update_node(task)
+            self.update_group(task, True)
         return False
 
     def process_tasks(self) -> None:
         while True:
             # Read access to `_in_flight` isn't protected because we know it can't be changed outside of our thread.
-            if not self._in_flight and not self.load_pg_dist_node():
+            if not self._in_flight and not self.load_pg_dist_group():
                 break
 
             i, task = self.pick_task()
@@ -287,7 +590,7 @@ def process_tasks(self) -> None:
             with self._condition:
                 if self._tasks:
                     if update_cache:
-                        self._pg_dist_node[task.group] = task
+                        self._pg_dist_group[task.groupid] = task
 
                     if update_cache is False:  # an indicator that process_tasks has started a transaction
                         self._in_flight = task
@@ -302,7 +605,7 @@ def run(self) -> None:
         while True:
             try:
                 with self._condition:
-                    if self._schedule_load_pg_dist_node:
+                    if self._schedule_load_pg_dist_group:
                         timeout = -1
                     elif self._in_flight:
                         timeout = self._in_flight.deadline - time.time() if self._tasks else None
@@ -319,9 +622,9 @@
             except Exception:
                 logger.exception('run')
 
-    def _add_task(self, task: PgDistNode) -> bool:
+    def _add_task(self, task: PgDistTask) -> bool:
         with self._condition:
-            i = self.find_task_by_group(task.group)
+            i = self.find_task_by_groupid(task.groupid)
 
             # The `PgDistNode.timeout` == None is an indicator that it was scheduled from the sync_meta_data().
             if task.timeout is None:
@@ -333,37 +636,48 @@
            # key is updated in DCS. Therefore it is possible that :func:`sync_meta_data` will try to create a task
            # based on the outdated values of "state"/"role". To solve it we introduce an artificial timeout.
# Only when the timeout is reached new tasks could be scheduled from sync_meta_data() - if self._in_flight and self._in_flight.group == task.group and self._in_flight.timeout is not None\ + if self._in_flight and self._in_flight.groupid == task.groupid and self._in_flight.timeout is not None\ and self._in_flight.deadline > time.time(): return False - # Override already existing task for the same worker group + # Override already existing task for the same worker groupid if i is not None: if task != self._tasks[i]: logger.debug('Overriding existing task: %s != %s', self._tasks[i], task) self._tasks[i] = task self._condition.notify() return True - # Add the task to the list if Worker node state is different from the cached `pg_dist_node` - elif self._schedule_load_pg_dist_node or task != self._pg_dist_node.get(task.group)\ - or self._in_flight and task.group == self._in_flight.group: + # Add the task to the list if Worker node state is different from the cached `pg_dist_group` + elif self._schedule_load_pg_dist_group or task != self._pg_dist_group.get(task.groupid)\ + or self._in_flight and task.groupid == self._in_flight.groupid: logger.debug('Adding the new task: %s', task) self._tasks.append(task) self._condition.notify() return True return False - def add_task(self, event: str, group: int, conn_url: str, - timeout: Optional[float] = None, cooldown: Optional[float] = None) -> Optional[PgDistNode]: + @staticmethod + def _pg_dist_node(role: str, conn_url: str) -> Optional[PgDistNode]: try: r = urlparse(conn_url) + if r.hostname: + return PgDistNode(r.hostname, r.port or 5432, role) except Exception as e: - return logger.error('Failed to parse connection url %s: %r', conn_url, e) - host = r.hostname - if host: - port = r.port or 5432 - task = PgDistNode(group, host, port, event, timeout=timeout, cooldown=cooldown) - return task if self._add_task(task) else None + logger.error('Failed to parse connection url %s: %r', conn_url, e) + + def add_task(self, event: str, groupid: int, cluster: Cluster, leader_name: str, leader_url: str, + timeout: Optional[float] = None, cooldown: Optional[float] = None) -> Optional[PgDistTask]: + primary = self._pg_dist_node('demoted' if event == 'before_demote' else 'primary', leader_url) + if not primary: + return + + task = PgDistTask(groupid, {primary}, event=event, timeout=timeout, cooldown=cooldown) + for member in cluster.members: + secondary = self._pg_dist_node('secondary', member.conn_url)\ + if member.name != leader_name and member.is_running and member.conn_url else None + if secondary: + task.add(secondary) + return task if self._add_task(task) else None def handle_event(self, cluster: Cluster, event: Dict[str, Any]) -> None: if not self.is_alive(): @@ -371,10 +685,10 @@ def handle_event(self, cluster: Cluster, event: Dict[str, Any]) -> None: worker = cluster.workers.get(event['group']) if not (worker and worker.leader and worker.leader.name == event['leader'] and worker.leader.conn_url): - return + return logger.info('Discarding event %s', event) - task = self.add_task(event['type'], event['group'], - worker.leader.conn_url, + task = self.add_task(event['type'], event['group'], worker, + worker.leader.name, worker.leader.conn_url, event['timeout'], event['cooldown'] * 1000) if task and event['type'] == 'before_demote': task.wait() diff --git a/patroni/postgresql/postmaster.py b/patroni/postgresql/postmaster.py index 359067c24..ea2187782 100644 --- a/patroni/postgresql/postmaster.py +++ b/patroni/postgresql/postmaster.py @@ -1,16 +1,17 @@ import 
logging import multiprocessing import os -import psutil import re import signal import subprocess import sys from multiprocessing.connection import Connection -from typing import Dict, Optional, List +from typing import Dict, List, Optional + +import psutil -from patroni import PATRONI_ENV_PREFIX, KUBERNETES_ENV_PREFIX +from patroni import KUBERNETES_ENV_PREFIX, PATRONI_ENV_PREFIX # avoid spawning the resource tracker process if sys.version_info >= (3, 8): # pragma: no cover diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 91cddd59c..762e65e2a 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -7,14 +7,14 @@ from enum import IntEnum from threading import Lock, Thread -from typing import Any, Callable, Dict, List, Optional, Union, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from . import Postgresql -from .connection import get_connection_cursor -from .misc import format_lsn, fsync_dir, parse_history, parse_lsn from ..async_executor import CriticalTask from ..collections import EMPTY_DICT from ..dcs import Leader, RemoteMember +from . import Postgresql +from .connection import get_connection_cursor +from .misc import format_lsn, fsync_dir, parse_history, parse_lsn logger = logging.getLogger(__name__) diff --git a/patroni/postgresql/slots.py b/patroni/postgresql/slots.py index e8ae8a18b..9e5216fbb 100644 --- a/patroni/postgresql/slots.py +++ b/patroni/postgresql/slots.py @@ -6,22 +6,24 @@ import logging import os import shutil + from collections import defaultdict from contextlib import contextmanager from threading import Condition, Thread -from typing import Any, Dict, Iterator, List, Optional, Union, Tuple, TYPE_CHECKING, Collection +from typing import Any, Collection, Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING, Union -from .connection import get_connection_cursor -from .misc import format_lsn, fsync_dir from .. import global_config from ..dcs import Cluster, Leader from ..file_perm import pg_perm from ..psycopg import OperationalError from ..tags import Tags +from .connection import get_connection_cursor +from .misc import format_lsn, fsync_dir if TYPE_CHECKING: # pragma: no cover from psycopg import Cursor from psycopg2 import cursor + from . import Postgresql logger = logging.getLogger(__name__) diff --git a/patroni/postgresql/sync.py b/patroni/postgresql/sync.py index 5b312414e..b4c6bf050 100644 --- a/patroni/postgresql/sync.py +++ b/patroni/postgresql/sync.py @@ -3,12 +3,13 @@ import time from copy import deepcopy -from typing import Collection, List, NamedTuple, Tuple, TYPE_CHECKING +from typing import Collection, List, NamedTuple, Optional, TYPE_CHECKING from .. import global_config from ..collections import CaseInsensitiveDict, CaseInsensitiveSet from ..dcs import Cluster from ..psycopg import quote_ident as _quote_ident + if TYPE_CHECKING: # pragma: no cover from . 
import Postgresql @@ -138,7 +139,7 @@ def parse_sync_standby_names(value: str) -> _SSN: if len(synclist) == i + 1: # except the last token raise ValueError("Unparseable synchronous_standby_names value %r: Unexpected token %s %r at %d" % (value, a_type, a_value, a_pos)) - elif a_type != 'comma': + if a_type != 'comma': raise ValueError("Unparseable synchronous_standby_names value %r: ""Got token %s %r while" " expecting comma at %d" % (value, a_type, a_value, a_pos)) elif a_type in {'ident', 'first', 'any'}: @@ -154,6 +155,26 @@ def parse_sync_standby_names(value: str) -> _SSN: return _SSN(sync_type, has_star, num, members) +class _SyncState(NamedTuple): + """Class representing the current synchronous state. + + :ivar sync_type: possible values: ``off``, ``priority``, ``quorum`` + :ivar numsync: how many nodes are required to be synchronous (according to ``synchronous_standby_names``). + Is ``0`` if ``synchronous_standby_names`` value is invalid or contains ``*``. + :ivar numsync_confirmed: how many nodes are known to be synchronous according to the ``pg_stat_replication`` view. + Only nodes that caught up with the :attr:`SyncHandler._primary_flush_lsn` are counted. + :ivar sync: collection of synchronous node names. In case of quorum commit all nodes listed + in ``synchronous_standby_names``, otherwise nodes that are confirmed to be synchronous according + to the ``pg_stat_replication`` view. + :ivar active: collection of node names that are streaming and have no restrictions to become synchronous. + """ + sync_type: str + numsync: int + numsync_confirmed: int + sync: CaseInsensitiveSet + active: CaseInsensitiveSet + + class _Replica(NamedTuple): """Class representing a single replica that is eligible to be synchronous. @@ -217,6 +238,8 @@ def __init__(self, postgresql: 'Postgresql', cluster: Cluster) -> None: # Prefer replicas that are in state ``sync`` and with higher values of ``write``/``flush``/``replay`` LSN. self.sort(key=lambda r: (r.sync_state, r.lsn), reverse=True) + # When checking ``maximum_lag_on_syncnode`` we want to compare with the most + # up-to-date replica otherwise with cluster LSN if there is only one replica. self.max_lsn = max(self, key=lambda x: x.lsn).lsn if len(self) > 1 else postgresql.last_operation() @@ -278,12 +301,22 @@ def _process_replica_readiness(self, cluster: Cluster, replica_list: _ReplicaLis # if standby name is listed in the /sync key we can count it as synchronous, otherwise # it becomes really synchronous when sync_state = 'sync' and it is known that it managed to catch up if replica.application_name not in self._ready_replicas\ - and replica.application_name in self._ssn_data.members\ - and (cluster.sync.matches(replica.application_name) - or replica.sync_state == 'sync' and replica.lsn >= self._primary_flush_lsn): - self._ready_replicas[replica.application_name] = replica.pid - - def current_state(self, cluster: Cluster) -> Tuple[CaseInsensitiveSet, CaseInsensitiveSet]: + and replica.application_name in self._ssn_data.members: + if global_config.is_quorum_commit_mode: + # When quorum commit is enabled we can't check against cluster.sync because nodes + # are written there when at least one of them caught up with _primary_flush_lsn. 
+                    if replica.lsn >= self._primary_flush_lsn\
+                            and (replica.sync_state == 'quorum'
+                                 or (not self._postgresql.supports_quorum_commit
+                                     and replica.sync_state in ('sync', 'potential'))):
+                        self._ready_replicas[replica.application_name] = replica.pid
+                elif cluster.sync.matches(replica.application_name)\
+                        or replica.sync_state == 'sync' and replica.lsn >= self._primary_flush_lsn:
+                    # if standby name is listed in the /sync key we can count it as synchronous, otherwise it becomes
+                    # "really" synchronous when sync_state = 'sync' and we know that it managed to catch up
+                    self._ready_replicas[replica.application_name] = replica.pid
+
+    def current_state(self, cluster: Cluster) -> _SyncState:
         """Find the best candidates to be the synchronous standbys.
 
         Current synchronous standby is always preferred, unless it has disconnected or does not want to be a
@@ -291,51 +324,86 @@ def current_state(self, cluster: Cluster) -> Tuple[CaseInsensitiveSet, CaseInsen
 
         Standbys are selected based on values from the global configuration:
 
-        - `maximum_lag_on_syncnode`: would help swapping unhealthy sync replica in case if it stops
-          responding (or hung). Please set the value high enough so it won't unncessarily swap sync
-          standbys during high loads. Any value less or equal of 0 keeps the behavior backward compatible.
-          Please note that it will not also swap sync standbys in case where all replicas are hung.
-        - `synchronous_node_count`: controlls how many nodes should be set as synchronous.
+        - ``maximum_lag_on_syncnode``: would help swapping an unhealthy sync replica in case it stops
+          responding (or is hung). Please set the value high enough, so it won't unnecessarily swap sync
+          standbys during high loads. Any value less than or equal to ``0`` keeps the behavior backwards compatible.
+          Please note that it will also not swap sync standbys when all replicas are hung.
+
+        - ``synchronous_node_count``: controls how many nodes should be set as synchronous.
 
-        :returns: tuple of candidates :class:`CaseInsensitiveSet` and synchronous standbys :class:`CaseInsensitiveSet`.
+        :param cluster: current cluster topology from DCS
+
+        :returns: current synchronous replication state as a :class:`_SyncState` object
         """
         self._handle_synchronous_standby_names_change()
 
         replica_list = _ReplicaList(self._postgresql, cluster)
         self._process_replica_readiness(cluster, replica_list)
 
+        active = CaseInsensitiveSet()
+        sync_nodes = CaseInsensitiveSet()
+        numsync_confirmed = 0
+
         sync_node_count = global_config.synchronous_node_count if self._postgresql.supports_multiple_sync else 1
         sync_node_maxlag = global_config.maximum_lag_on_syncnode
 
-        candidates = CaseInsensitiveSet()
-        sync_nodes = CaseInsensitiveSet()
         # Prefer members without nofailover tag. We are relying on the fact that sorts are guaranteed to be stable.
         for replica in sorted(replica_list, key=lambda x: x.nofailover):
             if sync_node_maxlag <= 0 or replica_list.max_lsn - replica.lsn <= sync_node_maxlag:
-                candidates.add(replica.application_name)
-                if replica.sync_state == 'sync' and replica.application_name in self._ready_replicas:
-                    sync_nodes.add(replica.application_name)
-                if len(candidates) >= sync_node_count:
-                    break
-
-        return candidates, sync_nodes
-
-    def set_synchronous_standby_names(self, sync: Collection[str]) -> None:
-        """Constructs and sets "synchronous_standby_names" GUC value.
+                if global_config.is_quorum_commit_mode:
+                    # We do not add nodes with `nofailover` enabled because that reduces availability.
+                    # We need to check LSN quorum only among nodes that are promotable because
+                    # there is a chance that a non-promotable node is ahead of a promotable one.
+                    if not replica.nofailover or len(active) < sync_node_count:
+                        if replica.application_name in self._ready_replicas:
+                            numsync_confirmed += 1
+                        active.add(replica.application_name)
+                else:
+                    active.add(replica.application_name)
+                    if replica.sync_state == 'sync' and replica.application_name in self._ready_replicas:
+                        sync_nodes.add(replica.application_name)
+                        numsync_confirmed += 1
+                if len(active) >= sync_node_count:
+                    break
+
+        if global_config.is_quorum_commit_mode:
+            sync_nodes = CaseInsensitiveSet() if self._ssn_data.has_star else self._ssn_data.members
+
+        return _SyncState(
+            self._ssn_data.sync_type,
+            0 if self._ssn_data.has_star else self._ssn_data.num,
+            numsync_confirmed,
+            sync_nodes,
+            active)
+
+    def set_synchronous_standby_names(self, sync: Collection[str], num: Optional[int] = None) -> None:
+        """Constructs and sets ``synchronous_standby_names`` GUC value.
+
+        .. note::
+            standbys in ``synchronous_standby_names`` will be sorted by name.
+
+        :param sync: set of nodes to sync to
+        :param num: specifies the number of nodes to sync to. *num* is only set when quorum commit is enabled
         """
-        has_asterisk = '*' in sync
+        # Special case. If the sync node set is empty but the requested number of sync nodes is >= 1
+        # we want to set synchronous_standby_names to '*'
+        has_asterisk = '*' in sync or num and num >= 1 and not sync
         if has_asterisk:
             sync = ['*']
         else:
-            sync = [quote_ident(x) for x in sync]
+            sync = [quote_ident(x) for x in sorted(sync)]
 
         if self._postgresql.supports_multiple_sync and len(sync) > 1:
-            sync_param = '{0} ({1})'.format(len(sync), ','.join(sync))
+            if num is None:
+                num = len(sync)
+            sync_param = ','.join(sync)
         else:
             sync_param = next(iter(sync), None)
 
+        if global_config.is_quorum_commit_mode and sync or self._postgresql.supports_multiple_sync and len(sync) > 1:
+            prefix = 'ANY ' if global_config.is_quorum_commit_mode and self._postgresql.supports_quorum_commit else ''
+            sync_param = f'{prefix}{num} ({sync_param})'
+
         if not (self._postgresql.config.set_synchronous_standby_names(sync_param)
                 and self._postgresql.state == 'running' and self._postgresql.is_primary()) or has_asterisk:
             return
diff --git a/patroni/postgresql/validator.py b/patroni/postgresql/validator.py
index f118c973b..66c5ada3e 100644
--- a/patroni/postgresql/validator.py
+++ b/patroni/postgresql/validator.py
@@ -1,14 +1,15 @@
 import abc
-from copy import deepcopy
 import logging
-import yaml
 
+from copy import deepcopy
 from typing import Any, Dict, Iterator, List, MutableMapping, Optional, Tuple, Type, Union
 
-from .available_parameters import get_validator_files, PathLikeObj
+import yaml
+
 from ..collections import CaseInsensitiveDict, CaseInsensitiveSet
 from ..exceptions import PatroniException
 from ..utils import parse_bool, parse_int, parse_real
+from .available_parameters import get_validator_files, PathLikeObj
 
 logger = logging.getLogger(__name__)
diff --git a/patroni/psycopg.py b/patroni/psycopg.py
index 5d7c6a377..a91519f3b 100644
--- a/patroni/psycopg.py
+++ b/patroni/psycopg.py
@@ -5,6 +5,7 @@
 ``2.5.4``.
 """
 from typing import Any, Optional, TYPE_CHECKING, Union
+
 if TYPE_CHECKING:  # pragma: no cover
     from psycopg import Connection
     from psycopg2 import connection, cursor
@@ -14,10 +15,11 @@
 _legacy = False
 try:
     from psycopg2 import __version__
+
    from . import MIN_PSYCOPG2, parse_version
    if parse_version(__version__) < MIN_PSYCOPG2:
         raise ImportError
-    from psycopg2 import connect as _connect, Error, DatabaseError, OperationalError, ProgrammingError
+    from psycopg2 import connect as _connect, DatabaseError, Error, OperationalError, ProgrammingError
     from psycopg2.extensions import adapt
 
     try:
@@ -44,8 +46,9 @@ def quote_literal(value: Any, conn: Optional[Any] = None) -> str:
 except ImportError:
     import types
 
+    from psycopg import DatabaseError, Error, OperationalError, ProgrammingError, sql
+    # isort: off
     from psycopg import connect as __connect  # pyright: ignore [reportUnknownVariableType]
-    from psycopg import sql, Error, DatabaseError, OperationalError, ProgrammingError
 
     def __get_parameter_status(self: 'Connection[Any]', param_name: str) -> Optional[str]:
         """Helper function to be injected into :class:`Connection` object.
diff --git a/patroni/quorum.py b/patroni/quorum.py
new file mode 100644
index 000000000..059c8acc7
--- /dev/null
+++ b/patroni/quorum.py
@@ -0,0 +1,431 @@
+"""Implement state machine to manage ``synchronous_standby_names`` GUC and ``/sync`` key in DCS."""
+import logging
+
+from typing import Collection, Iterator, NamedTuple, Optional
+
+from .collections import CaseInsensitiveSet
+from .exceptions import PatroniException
+
+logger = logging.getLogger(__name__)
+
+
+class Transition(NamedTuple):
+    """Object describing transition of ``/sync`` or ``synchronous_standby_names`` to the new state.
+
+    .. note::
+        Object attributes represent the new state.
+
+    :ivar transition_type: possible values:
+
+        * ``sync`` - indicates that we need to update ``synchronous_standby_names``.
+        * ``quorum`` - indicates that we need to update ``/sync`` key in DCS.
+        * ``restart`` - caller should stop iterating over transitions and restart :class:`QuorumStateResolver`.
+    :ivar leader: the new value of the ``leader`` field in the ``/sync`` key.
+    :ivar num: the new value of the synchronous nodes count in ``synchronous_standby_names`` or value of the ``quorum``
+        field in the ``/sync`` key for :attr:`transition_type` values ``sync`` and ``quorum`` respectively.
+    :ivar names: the new value of node names listed in ``synchronous_standby_names`` or value of ``voters``
+        field in the ``/sync`` key for :attr:`transition_type` values ``sync`` and ``quorum`` respectively.
+    """
+
+    transition_type: str
+    leader: str
+    num: int
+    names: CaseInsensitiveSet
+
+
+class QuorumError(PatroniException):
+    """Exception indicating that the quorum state is broken."""
+
+
+class QuorumStateResolver:
+    """Calculates a list of state transitions and yields them as :class:`Transition` named tuples.
+
+    Synchronous replication state is set in two places:
+
+    * PostgreSQL configuration sets how many and which nodes are needed for a commit to succeed, abbreviated as
+      ``numsync`` and ``sync`` set here;
+    * DCS contains information about how many and which nodes need to be interrogated to be sure to see a WAL position
+      containing the latest confirmed commit, abbreviated as ``quorum`` and ``voters`` set.
+
+    .. note::
+        Both of the above pairs have the meaning "ANY n OF set".
+
+    The number of nodes needed for a commit to succeed, ``numsync``, is also called the replication factor.
+
+    To guarantee zero transaction loss on failover we need to keep the invariant that at all times any subset of
+    nodes that can acknowledge a commit overlaps with any subset of nodes that can achieve quorum to promote a new
+    leader. Given a desired replication factor and a set of nodes able to participate in sync replication there
+    is one optimal state satisfying this condition. Given the node set ``active``, the optimal state is::
+
+        sync = voters = active
+
+        numsync = min(sync_wanted, len(active))
+
+        quorum = len(active) - numsync
+
+    We need to be able to produce a series of state changes that take the system to this desired state from any
+    other arbitrary state given arbitrary changes in node availability, configuration and interrupted transitions.
+
+    To keep the invariant, the rule to follow is that when increasing ``numsync`` or ``quorum``, we need to perform the
+    increasing operation first. When decreasing either, the decreasing operation needs to be performed later. In other
+    words:
+
+    * If a user increases ``synchronous_node_count`` configuration, first we increase ``synchronous_standby_names``
+      (``numsync``), then we decrease ``quorum`` field in the ``/sync`` key;
+    * If a user decreases ``synchronous_node_count`` configuration, first we increase ``quorum`` field in the ``/sync``
+      key, then we decrease ``synchronous_standby_names`` (``numsync``).
+
+    Order of adding or removing nodes from ``sync`` and ``voters`` depends on the state of
+    ``synchronous_standby_names``.
+
+    When adding new nodes::
+
+        if ``sync`` (``synchronous_standby_names``) is empty:
+            add new nodes first to ``sync`` and then to ``voters`` when ``numsync_confirmed`` > ``0``.
+        else:
+            add new nodes first to ``voters`` and then to ``sync``.
+
+    When removing nodes::
+
+        if ``sync`` (``synchronous_standby_names``) will become empty after removal:
+            first remove nodes from ``voters`` and then from ``sync``.
+        else:
+            first remove nodes from ``sync`` and then from ``voters``.
+            Make ``voters`` empty if ``numsync_confirmed`` == ``0``.
+
+    :ivar leader: name of the leader, according to the ``/sync`` key.
+    :ivar quorum: ``quorum`` value from the ``/sync`` key, the minimal number of nodes we need to see
+        when doing the leader race.
+    :ivar voters: ``sync_standby`` value from the ``/sync`` key, set of node names we will be
+        running the leader race against.
+    :ivar numsync: the number of synchronous nodes from the ``synchronous_standby_names``.
+    :ivar sync: set of node names listed in the ``synchronous_standby_names``.
+    :ivar numsync_confirmed: the number of nodes that are confirmed to have reached the "safe" LSN after
+        they were added to the ``synchronous_standby_names``.
+    :ivar active: set of node names that are replicating from the primary (according to ``pg_stat_replication``)
+        and are eligible to be listed in ``synchronous_standby_names``.
+    :ivar sync_wanted: desired number of synchronous nodes (``synchronous_node_count`` from the global configuration).
+    :ivar leader_wanted: the desired leader (could be different from the :attr:`leader` right after a failover).
+    """
+
+    def __init__(self, leader: str, quorum: int, voters: Collection[str],
+                 numsync: int, sync: Collection[str], numsync_confirmed: int,
+                 active: Collection[str], sync_wanted: int, leader_wanted: str) -> None:
+        """Instantiate :class:`QuorumStateResolver` based on input parameters.
+
+        :param leader: name of the leader, according to the ``/sync`` key.
+        :param quorum: ``quorum`` value from the ``/sync`` key, the minimal number of nodes we need to see
+            when doing the leader race.
+        :param voters: ``sync_standby`` value from the ``/sync`` key, set of node names we will be
+            running the leader race against.
+        :param numsync: the number of synchronous nodes from the ``synchronous_standby_names``.
+        :param sync: set of node names listed in the ``synchronous_standby_names``.
+        :param numsync_confirmed: the number of nodes that are confirmed to have reached the "safe" LSN after
+            they were added to the ``synchronous_standby_names``.
+        :param active: set of node names that are replicating from the primary (according to ``pg_stat_replication``)
+            and are eligible to be listed in ``synchronous_standby_names``.
+        :param sync_wanted: desired number of synchronous nodes
+            (``synchronous_node_count`` from the global configuration).
+        :param leader_wanted: the desired leader (could be different from the *leader* right after a failover).
+
+        """
+        self.leader = leader
+        self.quorum = quorum
+        self.voters = CaseInsensitiveSet(voters)
+        self.numsync = min(numsync, len(sync))  # numsync can't be bigger than the number of listed synchronous nodes.
+        self.sync = CaseInsensitiveSet(sync)
+        self.numsync_confirmed = numsync_confirmed
+        self.active = CaseInsensitiveSet(active)
+        self.sync_wanted = sync_wanted
+        self.leader_wanted = leader_wanted
+
+    def check_invariants(self) -> None:
+        """Checks the invariants of ``synchronous_standby_names`` and the ``/sync`` key in DCS.
+
+        .. seealso::
+            Check :class:`QuorumStateResolver`'s docstring for more information.
+
+        :raises:
+            :exc:`QuorumError`: in case of a broken state"""
+        voters = CaseInsensitiveSet(self.voters | CaseInsensitiveSet([self.leader]))
+        sync = CaseInsensitiveSet(self.sync | CaseInsensitiveSet([self.leader_wanted]))
+
+        # We need to verify that any subset of nodes that can acknowledge a commit overlaps
+        # with any subset of nodes that can achieve quorum to promote a new leader.
+        # ``+ 1`` is required because the leader is included in the set.
+        if self.voters and not (len(voters | sync) <= self.quorum + self.numsync + 1):
+            len_nodes = len(voters | sync)
+            raise QuorumError("Quorum and sync not guaranteed to overlap: "
+                              f"nodes {len_nodes} >= quorum {self.quorum} + sync {self.numsync} + 1")
+        # unstable cases, we are changing synchronous_standby_names and /sync key
+        # one after another, hence one set is allowed to be a subset of another
+        if not (voters.issubset(sync) or sync.issubset(voters)):
+            voters_only = voters - sync
+            sync_only = sync - voters
+            raise QuorumError(f"Mismatched sets: voter only={voters_only} sync only={sync_only}")
+
+    def quorum_update(self, quorum: int, voters: CaseInsensitiveSet, leader: Optional[str] = None,
+                      adjust_quorum: Optional[bool] = True) -> Iterator[Transition]:
+        """Updates :attr:`quorum`, :attr:`voters` and optionally :attr:`leader` fields.
+
+        :param quorum: the new value for :attr:`quorum`, could be adjusted depending
+            on values of :attr:`numsync_confirmed` and *adjust_quorum*.
+        :param voters: the new value for :attr:`voters`, could be adjusted if :attr:`numsync_confirmed` == ``0``.
+        :param leader: the new value for :attr:`leader`, optional.
+        :param adjust_quorum: if set to ``True`` the quorum requirement will be increased by the
+            difference between :attr:`numsync` and :attr:`numsync_confirmed`.
+
+        :yields: the new state of the ``/sync`` key as a :class:`Transition` object.
+
+        :raises:
+            :exc:`QuorumError` in case of invalid data or if the invariant after transition could not be satisfied
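+
+        .. note::
+            A worked illustration with hypothetical numbers: with :attr:`numsync` = ``2``,
+            :attr:`numsync_confirmed` = ``1`` and *adjust_quorum* = ``True``, a requested
+            *quorum* of ``1`` becomes ``1 + (2 - 1) = 2``, so that the leader race cannot
+            miss the one standby whose synchronous state is not yet confirmed.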
+ """ + if quorum < 0: + raise QuorumError(f'Quorum {quorum} < 0 of ({voters})') + if quorum > 0 and quorum >= len(voters): + raise QuorumError(f'Quorum {quorum} >= N of ({voters})') + + old_leader = self.leader + if leader is not None: # Change of leader was requested + self.leader = leader + elif self.numsync_confirmed == 0: + # If there are no nodes that known to caught up with the primary we want to reset quorum/voters in /sync key + quorum = 0 + voters = CaseInsensitiveSet() + elif adjust_quorum: + # It could be that the number of nodes that are known to catch up with the primary is below desired numsync. + # We want to increase quorum to guarantee that the sync node will be found during the leader race. + quorum += max(self.numsync - self.numsync_confirmed, 0) + + if (self.leader, quorum, voters) == (old_leader, self.quorum, self.voters): + if self.voters: + return + # If transition produces no change of leader/quorum/voters we want to give a hint to + # the caller to fetch the new state from the database and restart QuorumStateResolver. + yield Transition('restart', self.leader, self.quorum, self.voters) + + self.quorum = quorum + self.voters = voters + self.check_invariants() + logger.debug('quorum %s %s %s', self.leader, self.quorum, self.voters) + yield Transition('quorum', self.leader, self.quorum, self.voters) + + def sync_update(self, numsync: int, sync: CaseInsensitiveSet) -> Iterator[Transition]: + """Updates :attr:`numsync` and :attr:`sync` fields. + + :param numsync: the new value for :attr:`numsync`. + :param sync: the new value for :attr:`sync`: + + :yields: the new state of ``synchronous_standby_names`` as a :class:`Transition` object. + + :raises: + :exc:`QuorumError` in case of invalid data or if invariant after transition could not be satisfied + """ + if numsync < 0: + raise QuorumError(f'Sync {numsync} < 0 of ({sync})') + if numsync > len(sync): + raise QuorumError(f'Sync {numsync} > N of ({sync})') + + self.numsync = numsync + self.sync = sync + self.check_invariants() + logger.debug('sync %s %s %s', self.leader, self.numsync, self.sync) + yield Transition('sync', self.leader, self.numsync, self.sync) + + def __iter__(self) -> Iterator[Transition]: + """Iterate over the transitions produced by :meth:`_generate_transitions`. + + .. note:: + Merge two transitions of the same type to a single one. + + This is always safe because skipping the first transition is equivalent + to no one observing the intermediate state. + + :yields: transitions as :class:`Transition` objects. + """ + transitions = list(self._generate_transitions()) + for cur_transition, next_transition in zip(transitions, transitions[1:] + [None]): + if isinstance(next_transition, Transition) \ + and cur_transition.transition_type == next_transition.transition_type: + continue + yield cur_transition + if cur_transition.transition_type == 'restart': + break + + def __handle_non_steady_cases(self) -> Iterator[Transition]: + """Handle cases when set of transitions produced on previous run was interrupted. + + :yields: transitions as :class:`Transition` objects. + """ + if self.sync < self.voters: + logger.debug("Case 1: synchronous_standby_names %s is a subset of DCS state %s", self.sync, self.voters) + # Case 1: voters is superset of sync nodes. In the middle of changing voters (quorum). + # Evict dead nodes from voters that are not being synced. 
+            remove_from_voters = self.voters - (self.sync | self.active)
+            if remove_from_voters:
+                yield from self.quorum_update(
+                    quorum=len(self.voters) - len(remove_from_voters) - self.numsync,
+                    voters=CaseInsensitiveSet(self.voters - remove_from_voters),
+                    adjust_quorum=not (self.sync - self.active))
+            # Start syncing to nodes that are in voters and alive
+            add_to_sync = (self.voters & self.active) - self.sync
+            if add_to_sync:
+                yield from self.sync_update(self.numsync, CaseInsensitiveSet(self.sync | add_to_sync))
+        elif self.sync > self.voters:
+            logger.debug("Case 2: synchronous_standby_names %s is a superset of DCS state %s", self.sync, self.voters)
+            # Case 2: sync is a superset of voter nodes. In the middle of changing replication factor (sync).
+            # Add to voters nodes that are already synced and active
+            add_to_voters = (self.sync - self.voters) & self.active
+            if add_to_voters:
+                voters = CaseInsensitiveSet(self.voters | add_to_voters)
+                yield from self.quorum_update(len(voters) - self.numsync, voters)
+            # Remove from sync nodes that are dead
+            remove_from_sync = self.sync - self.voters
+            if remove_from_sync:
+                yield from self.sync_update(
+                    numsync=min(self.numsync, len(self.sync) - len(remove_from_sync)),
+                    sync=CaseInsensitiveSet(self.sync - remove_from_sync))
+
+        # After handling these two cases voters and sync must match.
+        assert self.voters == self.sync
+
+        safety_margin = self.quorum + min(self.numsync, self.numsync_confirmed) - len(self.voters | self.sync)
+        if safety_margin > 0:  # In the middle of changing replication factor.
+            if self.numsync > self.sync_wanted:
+                numsync = max(self.sync_wanted, len(self.voters) - self.quorum)
+                logger.debug('Case 3: replication factor %d is bigger than needed %d', self.numsync, numsync)
+                yield from self.sync_update(numsync, self.sync)
+            else:
+                quorum = len(self.sync) - self.numsync
+                logger.debug('Case 4: quorum %d is bigger than needed %d', self.quorum, quorum)
+                yield from self.quorum_update(quorum, self.voters)
+        else:
+            safety_margin = self.quorum + self.numsync - len(self.voters | self.sync)
+            if self.numsync == self.sync_wanted and safety_margin > 0 and self.numsync > self.numsync_confirmed:
+                yield from self.quorum_update(len(self.sync) - self.numsync, self.voters)
+
+    def __remove_gone_nodes(self) -> Iterator[Transition]:
+        """Remove inactive nodes from ``synchronous_standby_names`` and from ``/sync`` key.
+
+        :yields: transitions as :class:`Transition` objects.
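+
+        .. note::
+            A hypothetical trace: with ``quorum=1``, ``voters = sync = {a, b, c}``, ``numsync=2``,
+            ``numsync_confirmed=2`` and only ``{a, b}`` active, the spare quorum slot lets ``c``
+            be dropped from ``synchronous_standby_names`` first; the ``/sync`` key is updated
+            afterwards (quorum becomes ``0``), so the overlap invariant holds at every step.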
+ """ + to_remove = self.sync - self.active + if to_remove and self.sync == to_remove: + logger.debug("Removing nodes: %s", to_remove) + yield from self.quorum_update(0, CaseInsensitiveSet(), adjust_quorum=False) + yield from self.sync_update(0, CaseInsensitiveSet()) + elif to_remove: + logger.debug("Removing nodes: %s", to_remove) + can_reduce_quorum_by = self.quorum + # If we can reduce quorum size try to do so first + if can_reduce_quorum_by: + # Pick nodes to remove by sorted order to provide deterministic behavior for tests + remove = CaseInsensitiveSet(sorted(to_remove, reverse=True)[:can_reduce_quorum_by]) + sync = CaseInsensitiveSet(self.sync - remove) + # when removing nodes from sync we can safely increase numsync if requested + numsync = min(self.sync_wanted, len(sync)) if self.sync_wanted > self.numsync else self.numsync + yield from self.sync_update(numsync, sync) + voters = CaseInsensitiveSet(self.voters - remove) + to_remove &= self.sync + yield from self.quorum_update(len(voters) - self.numsync, voters, + adjust_quorum=not to_remove) + if to_remove: + assert self.quorum == 0 + numsync = self.numsync - len(to_remove) + sync = CaseInsensitiveSet(self.sync - to_remove) + voters = CaseInsensitiveSet(self.voters - to_remove) + sync_decrease = numsync - min(self.sync_wanted, len(sync)) + quorum = min(sync_decrease, len(voters) - 1) if sync_decrease else 0 + yield from self.quorum_update(quorum, voters, adjust_quorum=False) + yield from self.sync_update(numsync, sync) + + def __add_new_nodes(self) -> Iterator[Transition]: + """Add new active nodes to ``synchronous_standby_names`` and to ``/sync`` key. + + :yields: transitions as :class:`Transition` objects. + """ + to_add = self.active - self.sync + if to_add: + # First get to requested replication factor + logger.debug("Adding nodes: %s", to_add) + sync_wanted = min(self.sync_wanted, len(self.sync | to_add)) + increase_numsync_by = sync_wanted - self.numsync + if increase_numsync_by > 0: + if self.sync: + add = CaseInsensitiveSet(sorted(to_add)[:increase_numsync_by]) + increase_numsync_by = len(add) + else: # there is only the leader + add = to_add # and it is safe to add all nodes at once if sync is empty + yield from self.sync_update(self.numsync + increase_numsync_by, CaseInsensitiveSet(self.sync | add)) + voters = CaseInsensitiveSet(self.voters | add) + yield from self.quorum_update(len(voters) - sync_wanted, voters) + to_add -= self.sync + if to_add: + voters = CaseInsensitiveSet(self.voters | to_add) + yield from self.quorum_update(len(voters) - sync_wanted, voters, + adjust_quorum=sync_wanted > self.numsync_confirmed) + yield from self.sync_update(sync_wanted, CaseInsensitiveSet(self.sync | to_add)) + + def __handle_replication_factor_change(self) -> Iterator[Transition]: + """Handle change of the replication factor (:attr:`sync_wanted`, aka ``synchronous_node_count``). + + :yields: transitions as :class:`Transition` objects. 
+ """ + # Apply requested replication factor change + sync_increase = min(self.sync_wanted, len(self.sync)) - self.numsync + if sync_increase > 0: + # Increase replication factor + logger.debug("Increasing replication factor to %s", self.numsync + sync_increase) + yield from self.sync_update(self.numsync + sync_increase, self.sync) + yield from self.quorum_update(len(self.voters) - self.numsync, self.voters) + elif sync_increase < 0: + # Reduce replication factor + logger.debug("Reducing replication factor to %s", self.numsync + sync_increase) + if self.quorum - sync_increase < len(self.voters): + yield from self.quorum_update(len(self.voters) - self.numsync - sync_increase, self.voters, + adjust_quorum=self.sync_wanted > self.numsync_confirmed) + yield from self.sync_update(self.numsync + sync_increase, self.sync) + + def _generate_transitions(self) -> Iterator[Transition]: + """Produce a set of changes to safely transition from the current state to the desired. + + :yields: transitions as :class:`Transition` objects. + """ + logger.debug("Quorum state: leader %s quorum %s, voters %s, numsync %s, sync %s, " + "numsync_confirmed %s, active %s, sync_wanted %s leader_wanted %s", + self.leader, self.quorum, self.voters, self.numsync, self.sync, + self.numsync_confirmed, self.active, self.sync_wanted, self.leader_wanted) + try: + if self.leader_wanted != self.leader: # failover + voters = (self.voters - CaseInsensitiveSet([self.leader_wanted])) | CaseInsensitiveSet([self.leader]) + if not self.sync: + # If sync is empty we need to update synchronous_standby_names first + numsync = len(voters) - self.quorum + yield from self.sync_update(numsync, CaseInsensitiveSet(voters)) + # If leader changed we need to add the old leader to quorum (voters) + yield from self.quorum_update(self.quorum, CaseInsensitiveSet(voters), self.leader_wanted) + # right after promote there could be no replication connections yet + if not self.sync & self.active: + return # give another loop_wait seconds for replicas to reconnect before removing them from quorum + else: + self.check_invariants() + except QuorumError as e: + logger.warning('%s', e) + yield from self.quorum_update(len(self.sync) - self.numsync, self.sync) + + assert self.leader == self.leader_wanted + + # numsync_confirmed could be 0 after restart/failover, we will calculate it from quorum + if self.numsync_confirmed == 0 and self.sync & self.active: + self.numsync_confirmed = min(len(self.sync & self.active), len(self.voters) - self.quorum) + logger.debug('numsync_confirmed=0, adjusting it to %d', self.numsync_confirmed) + + yield from self.__handle_non_steady_cases() + + # We are in a steady state point. Find if desired state is different and act accordingly. 
+ + yield from self.__remove_gone_nodes() + + yield from self.__add_new_nodes() + + yield from self.__handle_replication_factor_change() diff --git a/patroni/raft_controller.py b/patroni/raft_controller.py index a9d7424c0..10845d244 100644 --- a/patroni/raft_controller.py +++ b/patroni/raft_controller.py @@ -1,7 +1,7 @@ import logging from .config import Config -from .daemon import AbstractPatroniDaemon, abstract_main, get_base_arg_parser +from .daemon import abstract_main, AbstractPatroniDaemon, get_base_arg_parser from .dcs.raft import KVStoreTTL logger = logging.getLogger(__name__) diff --git a/patroni/request.py b/patroni/request.py index 16659c96a..2bba134b1 100644 --- a/patroni/request.py +++ b/patroni/request.py @@ -1,11 +1,11 @@ """Facilities for handling communication with Patroni's REST API.""" import json -import urllib3 from typing import Any, Dict, Optional, Union - from urllib.parse import urlparse, urlunparse +import urllib3 + from .config import Config from .dcs import Member from .utils import USER_AGENT diff --git a/patroni/scripts/aws.py b/patroni/scripts/aws.py index 9fcc81601..d987f7859 100755 --- a/patroni/scripts/aws.py +++ b/patroni/scripts/aws.py @@ -3,13 +3,14 @@ import json import logging import sys + +from typing import Any, Optional + import boto3 from botocore.exceptions import ClientError from botocore.utils import IMDSFetcher -from typing import Any, Optional - from ..utils import Retry, RetryFailedError logger = logging.getLogger(__name__) diff --git a/patroni/scripts/barman/cli.py b/patroni/scripts/barman/cli.py index fcec14e69..148947a6b 100644 --- a/patroni/scripts/barman/cli.py +++ b/patroni/scripts/barman/cli.py @@ -9,11 +9,12 @@ See :class:ExitCode` for possible exit codes of this main script. """ -from argparse import ArgumentParser -from enum import IntEnum import logging import sys +from argparse import ArgumentParser +from enum import IntEnum + from .config_switch import run_barman_config_switch from .recover import run_barman_recover from .utils import ApiNotOk, PgBackupApi, set_up_logging diff --git a/patroni/scripts/barman/config_switch.py b/patroni/scripts/barman/config_switch.py index f65dc3f13..c45401086 100644 --- a/patroni/scripts/barman/config_switch.py +++ b/patroni/scripts/barman/config_switch.py @@ -14,15 +14,15 @@ Refer to :class:`ExitCode` for possible exit codes of this sub-command. """ -from argparse import Namespace -from enum import IntEnum import logging import time + +from argparse import Namespace +from enum import IntEnum from typing import Optional, TYPE_CHECKING from .utils import OperationStatus, RetriesExceeded - if TYPE_CHECKING: # pragma: no cover from .utils import PgBackupApi diff --git a/patroni/scripts/barman/recover.py b/patroni/scripts/barman/recover.py index 5a07a7a5c..a4d12b8a2 100644 --- a/patroni/scripts/barman/recover.py +++ b/patroni/scripts/barman/recover.py @@ -14,15 +14,15 @@ Refer to :class:`ExitCode` for possible exit codes of this sub-command. 
""" -from argparse import Namespace -from enum import IntEnum import logging import time + +from argparse import Namespace +from enum import IntEnum from typing import TYPE_CHECKING from .utils import OperationStatus, RetriesExceeded - if TYPE_CHECKING: # pragma: no cover from .utils import PgBackupApi diff --git a/patroni/scripts/barman/utils.py b/patroni/scripts/barman/utils.py index 96b68bbf2..6145d9fad 100644 --- a/patroni/scripts/barman/utils.py +++ b/patroni/scripts/barman/utils.py @@ -2,11 +2,12 @@ """Utilitary stuff to be used by Barman related scripts.""" -from enum import IntEnum import json import logging -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union import time + +from enum import IntEnum +from typing import Any, Callable, Dict, Optional, Tuple, Type, Union from urllib.parse import urljoin from urllib3 import PoolManager diff --git a/patroni/tags.py b/patroni/tags.py index e1e68f267..844a17617 100644 --- a/patroni/tags.py +++ b/patroni/tags.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Optional -from patroni.utils import parse_int, parse_bool +from patroni.utils import parse_bool, parse_int class Tags(abc.ABC): diff --git a/patroni/utils.py b/patroni/utils.py index 08a1f3cb3..021fa416a 100644 --- a/patroni/utils.py +++ b/patroni/utils.py @@ -21,13 +21,13 @@ import sys import tempfile import time -from shlex import split - -from typing import Any, Callable, Dict, Iterator, List, Optional, Union, Tuple, Type, TYPE_CHECKING from collections import OrderedDict -from dateutil import tz from json import JSONDecoder +from shlex import split +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING, Union + +from dateutil import tz from urllib3.response import HTTPResponse from .exceptions import PatroniException @@ -922,7 +922,7 @@ def cluster_as_json(cluster: 'Cluster') -> Dict[str, Any]: * ``members``: list of members in the cluster. Each value is a :class:`dict` that may have the following keys: * ``name``: the name of the host (unique in the cluster). 
The ``members`` list is sorted by this key; - * ``role``: ``leader``, ``standby_leader``, ``sync_standby``, or ``replica``; + * ``role``: ``leader``, ``standby_leader``, ``sync_standby``, ``quorum_standby``, or ``replica``; * ``state``: ``stopping``, ``stopped``, ``stop failed``, ``crashed``, ``running``, ``starting``, ``start failed``, ``restarting``, ``restart failed``, ``initializing new cluster``, ``initdb failed``, ``running custom bootstrap script``, ``custom bootstrap failed``, or ``creating replica``; @@ -949,11 +949,12 @@ def cluster_as_json(cluster: 'Cluster') -> Dict[str, Any]: cluster_lsn = cluster.status.last_lsn ret: Dict[str, Any] = {'members': []} + sync_role = 'quorum_standby' if config.is_quorum_commit_mode else 'sync_standby' for m in cluster.members: if m.name == leader_name: role = 'standby_leader' if config.is_standby_cluster else 'leader' elif config.is_synchronous_mode and cluster.sync.matches(m.name): - role = 'sync_standby' + role = sync_role else: role = 'replica' diff --git a/patroni/validator.py b/patroni/validator.py index 233b05b72..c1438d38c 100644 --- a/patroni/validator.py +++ b/patroni/validator.py @@ -9,13 +9,13 @@ import shutil import socket -from typing import Any, Dict, Union, Iterator, List, Optional as OptionalType, Tuple, TYPE_CHECKING +from typing import Any, Dict, Iterator, List, Optional as OptionalType, Tuple, TYPE_CHECKING, Union from .collections import CaseInsensitiveSet, EMPTY_DICT from .dcs import dcs_modules from .exceptions import ConfigParseError -from .utils import parse_int, split_host_port, data_directory_is_empty, get_major_version from .log import type_logformat +from .utils import data_directory_is_empty, get_major_version, parse_int, split_host_port def validate_log_field(field: Union[str, Dict[str, Any], Any]) -> bool: @@ -991,6 +991,7 @@ def validate_watchdog_mode(value: Any) -> None: Optional("dir"): str, Optional("file_num"): int, Optional("file_size"): int, + Optional("mode"): IntValidator(min=0, max=511, expected_type=int, raise_assert=True), Optional("loggers"): dict }, Optional("ctl"): { diff --git a/patroni/watchdog/__init__.py b/patroni/watchdog/__init__.py index 4acc855e8..77a532631 100644 --- a/patroni/watchdog/__init__.py +++ b/patroni/watchdog/__init__.py @@ -1,2 +1,3 @@ -from patroni.watchdog.base import WatchdogError, Watchdog +from patroni.watchdog.base import Watchdog, WatchdogError + __all__ = ['WatchdogError', 'Watchdog'] diff --git a/patroni/watchdog/base.py b/patroni/watchdog/base.py index 7d1cdca67..ad80dfe55 100644 --- a/patroni/watchdog/base.py +++ b/patroni/watchdog/base.py @@ -2,8 +2,8 @@ import logging import platform import sys -from threading import RLock +from threading import RLock from typing import Any, Callable, Dict, Optional, Union from ..config import Config diff --git a/setup.py b/setup.py index b8b8a062c..0d69a8403 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ Setup file for patroni """ +import glob import inspect import logging import os @@ -47,6 +48,7 @@ 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: Implementation :: CPython', ] @@ -68,33 +70,67 @@ def finalize_options(self): pass -class Flake8(_Command): +class _Lint(_Command): - def package_files(self): - seen_package_directories = () - directories = self.distribution.package_dir or {} - empty_directory_exists = "" in directories - packages = self.distribution.packages or [] 
- for package in packages: - if package in directories: - package_directory = directories[package] - elif empty_directory_exists: - package_directory = os.path.join(directories[""], package) + def package_modules(self): + package_dirs = self.distribution.package_dir or {} + for package in self.distribution.packages or []: + if package in package_dirs: + yield package_dirs[package] + elif '' in package_dirs: + yield os.path.join(package_dirs[''], package) else: - package_directory = package + yield package - if not package_directory.startswith(seen_package_directories): - seen_package_directories += (package_directory + ".",) - yield package_directory + def package_directories(self): + for module in self.package_modules(): + yield module.replace('.', os.path.sep) - def targets(self): - return [package for package in self.package_files()] + ['tests', 'features', 'setup.py'] + def aux_directories(self): + for dir_name in ('tests', 'features'): + for root, dirs, files in os.walk(dir_name): + for name in dirs: + yield os.path.join(root, name) + + def dirs_to_check(self): + yield from self.package_directories() + yield from self.aux_directories() + + def files_to_check(self): + for path in self.dirs_to_check(): + for python_file in glob.iglob(os.path.join(path, '*.py')): + yield python_file + + for filename in self.distribution.py_modules or []: + yield f'{filename}.py' + + yield 'setup.py' + + +class Flake8(_Lint): def run(self): from flake8.main.cli import main logging.getLogger().setLevel(logging.ERROR) - raise SystemExit(main(self.targets())) + raise SystemExit(main(list(self.files_to_check()))) + + +class ISort(_Lint): + + def run(self): + from isort import api + + wrong_sorted_files = False + for python_file in self.files_to_check(): + try: + if not api.check_file(python_file, settings_path=__location__, show_diff=True): + wrong_sorted_files = True + except OSError as error: + logging.warning('Unable to parse file %s due to %r', python_file, error) + wrong_sorted_files = True + if wrong_sorted_files: + sys.exit(1) class PyTest(_Command): @@ -177,7 +213,7 @@ def main(): ]}, install_requires=install_requires, extras_require=EXTRAS_REQUIRE, - cmdclass={'test': PyTest, 'flake8': Flake8}, + cmdclass={'test': PyTest, 'flake8': Flake8, 'isort': ISort}, entry_points={'console_scripts': CONSOLE_SCRIPTS}, ) diff --git a/tests/__init__.py b/tests/__init__.py index 8f531baed..66659deee 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,7 +2,8 @@ import os import shutil import unittest -from unittest.mock import Mock, PropertyMock, patch + +from unittest.mock import Mock, patch, PropertyMock import urllib3 @@ -178,8 +179,12 @@ def execute(self, sql, *params): b'3\t0/403DD98\tno recovery target specified\n')] elif sql.startswith('SELECT pg_catalog.citus_add_node'): self.results = [(2,)] - elif sql.startswith('SELECT nodeid, groupid'): - self.results = [(1, 0, 'host1', 5432, 'primary'), (2, 1, 'host2', 5432, 'primary')] + elif sql.startswith('SELECT groupid, nodename'): + self.results = [(0, 'host1', 5432, 'primary', 1), + (0, '127.0.0.1', 5436, 'secondary', 2), + (1, 'host4', 5432, 'primary', 3), + (1, '127.0.0.1', 5437, 'secondary', 4), + (1, '127.0.0.1', 5438, 'secondary', 5)] else: self.results = [(None, None, None, None, None, None, None, None, None, None)] self.rowcount = len(self.results) diff --git a/tests/test_api.py b/tests/test_api.py index e4e9676a3..55e0016bc 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,12 +1,12 @@ import datetime import json -import unittest import 
socket +import unittest from http.server import HTTPServer from io import BytesIO as IO from socketserver import ThreadingMixIn -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import Mock, patch, PropertyMock from patroni import global_config from patroni.api import RestApiHandler, RestApiServer @@ -20,7 +20,6 @@ from . import MockConnect, psycopg_connect from .test_ha import get_cluster_initialized_without_leader - future_restart_time = datetime.datetime.now(tzutc) + datetime.timedelta(days=5) postmaster_start_time = datetime.datetime.now(tzutc) @@ -205,7 +204,6 @@ class TestRestApiHandler(unittest.TestCase): def test_do_GET(self): MockPostgresql.pending_restart_reason = {'max_connections': get_param_diff('200', '100')} MockPatroni.dcs.cluster.status.last_lsn = 20 - MockPatroni.dcs.cluster.sync.members = [MockPostgresql.name] with patch.object(global_config.__class__, 'is_synchronous_mode', PropertyMock(return_value=True)): MockRestApiServer(RestApiHandler, 'GET /replica') MockRestApiServer(RestApiHandler, 'GET /replica?lag=1M') @@ -223,12 +221,16 @@ def test_do_GET(self): Mock(return_value={'role': 'replica', 'sync_standby': True})): MockRestApiServer(RestApiHandler, 'GET /synchronous') MockRestApiServer(RestApiHandler, 'GET /read-only-sync') + with patch.object(RestApiHandler, 'get_postgresql_status', + Mock(return_value={'role': 'replica', 'quorum_standby': True})): + MockRestApiServer(RestApiHandler, 'GET /quorum') + MockRestApiServer(RestApiHandler, 'GET /read-only-quorum') with patch.object(RestApiHandler, 'get_postgresql_status', Mock(return_value={'role': 'replica'})): - MockPatroni.dcs.cluster.sync.members = [] MockRestApiServer(RestApiHandler, 'GET /asynchronous') with patch.object(MockHa, 'is_leader', Mock(return_value=True)): MockRestApiServer(RestApiHandler, 'GET /replica') MockRestApiServer(RestApiHandler, 'GET /read-only-sync') + MockRestApiServer(RestApiHandler, 'GET /read-only-quorum') with patch.object(global_config.__class__, 'is_standby_cluster', Mock(return_value=True)): MockRestApiServer(RestApiHandler, 'GET /standby_leader') MockPatroni.dcs.cluster = None diff --git a/tests/test_async_executor.py b/tests/test_async_executor.py index 08a919175..853d685ac 100644 --- a/tests/test_async_executor.py +++ b/tests/test_async_executor.py @@ -1,8 +1,9 @@ import unittest + +from threading import Thread from unittest.mock import Mock, patch from patroni.async_executor import AsyncExecutor, CriticalTask -from threading import Thread class TestAsyncExecutor(unittest.TestCase): diff --git a/tests/test_aws.py b/tests/test_aws.py index 69ca3285f..f63f188cf 100644 --- a/tests/test_aws.py +++ b/tests/test_aws.py @@ -1,10 +1,12 @@ -import botocore -import botocore.awsrequest import sys import unittest -from unittest.mock import Mock, PropertyMock, patch from collections import namedtuple +from unittest.mock import Mock, patch, PropertyMock + +import botocore +import botocore.awsrequest + from patroni.scripts.aws import AWSConnection, main as _main diff --git a/tests/test_barman.py b/tests/test_barman.py index eda9cdda3..260f4c4d9 100644 --- a/tests/test_barman.py +++ b/tests/test_barman.py @@ -1,16 +1,17 @@ import logging import unittest + from unittest import mock from unittest.mock import MagicMock, Mock, patch + from urllib3.exceptions import MaxRetryError from patroni.scripts.barman.cli import main -from patroni.scripts.barman.config_switch import (ExitCode as BarmanConfigSwitchExitCode, _should_skip_switch, - _switch_config, run_barman_config_switch) -from 
patroni.scripts.barman.recover import ExitCode as BarmanRecoverExitCode, _restore_backup, run_barman_recover +from patroni.scripts.barman.config_switch import _should_skip_switch, _switch_config, \ + ExitCode as BarmanConfigSwitchExitCode, run_barman_config_switch +from patroni.scripts.barman.recover import _restore_backup, ExitCode as BarmanRecoverExitCode, run_barman_recover from patroni.scripts.barman.utils import ApiNotOk, OperationStatus, PgBackupApi, RetriesExceeded, set_up_logging - API_URL = "http://localhost:7480" BARMAN_SERVER = "my_server" BARMAN_MODEL = "my_model" diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index 37725ae5d..6371f6b1c 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -1,6 +1,7 @@ import os import sys -from unittest.mock import Mock, PropertyMock, patch + +from unittest.mock import Mock, patch, PropertyMock from patroni.async_executor import CriticalTask from patroni.collections import CaseInsensitiveDict @@ -9,7 +10,7 @@ from patroni.postgresql.cancellable import CancellableSubprocess from patroni.postgresql.config import ConfigHandler, get_param_diff -from . import psycopg_connect, BaseTestPostgresql, mock_available_gucs +from . import BaseTestPostgresql, mock_available_gucs, psycopg_connect @patch('subprocess.call', Mock(return_value=0)) diff --git a/tests/test_callback_executor.py b/tests/test_callback_executor.py index 6ec53a3d5..e76c96578 100644 --- a/tests/test_callback_executor.py +++ b/tests/test_callback_executor.py @@ -1,7 +1,9 @@ -import psutil import unittest + from unittest.mock import Mock, patch +import psutil + from patroni.postgresql.callback_executor import CallbackExecutor diff --git a/tests/test_cancellable.py b/tests/test_cancellable.py index 23510cf91..41568ba72 100644 --- a/tests/test_cancellable.py +++ b/tests/test_cancellable.py @@ -1,7 +1,9 @@ -import psutil import unittest + from unittest.mock import Mock, patch +import psutil + from patroni.exceptions import PostgresException from patroni.postgresql.cancellable import CancellableSubprocess diff --git a/tests/test_citus.py b/tests/test_citus.py index bbda76724..63bbf3308 100644 --- a/tests/test_citus.py +++ b/tests/test_citus.py @@ -1,7 +1,11 @@ import time +import unittest + +from copy import deepcopy +from typing import List from unittest.mock import Mock, patch, PropertyMock -from patroni.postgresql.mpp.citus import CitusHandler +from patroni.postgresql.mpp.citus import CitusHandler, PgDistGroup, PgDistNode from patroni.psycopg import ProgrammingError from . 
import BaseTestPostgresql, MockCursor, psycopg_connect, SleepException @@ -21,7 +25,7 @@ def setUp(self): @patch('time.time', Mock(side_effect=[100, 130, 160, 190, 220, 250, 280, 310, 340, 370])) @patch('patroni.postgresql.mpp.citus.logger.exception', Mock(side_effect=SleepException)) @patch('patroni.postgresql.mpp.citus.logger.warning') - @patch('patroni.postgresql.mpp.citus.PgDistNode.wait', Mock()) + @patch('patroni.postgresql.mpp.citus.PgDistTask.wait', Mock()) @patch.object(CitusHandler, 'is_alive', Mock(return_value=True)) def test_run(self, mock_logger_warning): # `before_demote` or `before_promote` REST API calls starting a @@ -33,11 +37,11 @@ def test_run(self, mock_logger_warning): self.c.handle_event(self.cluster, {'type': 'before_demote', 'group': 1, 'leader': 'leader', 'timeout': 30, 'cooldown': 10}) - self.c.add_task('after_promote', 2, 'postgres://host3:5432/postgres') + self.c.add_task('after_promote', 2, self.cluster, self.cluster.leader_name, 'postgres://host3:5432/postgres') self.assertRaises(SleepException, self.c.run) mock_logger_warning.assert_called_once() self.assertTrue(mock_logger_warning.call_args[0][0].startswith('Rolling back transaction')) - self.assertTrue(repr(mock_logger_warning.call_args[0][1]).startswith('PgDistNode')) + self.assertTrue(repr(mock_logger_warning.call_args[0][1]).startswith('PgDistTask')) @patch.object(CitusHandler, 'is_alive', Mock(return_value=False)) @patch.object(CitusHandler, 'start', Mock()) @@ -55,59 +59,68 @@ def test_handle_event(self): def test_add_task(self): with patch('patroni.postgresql.mpp.citus.logger.error') as mock_logger, \ patch('patroni.postgresql.mpp.citus.urlparse', Mock(side_effect=Exception)): - self.c.add_task('', 1, None) + self.c.add_task('', 1, self.cluster, '', None) mock_logger.assert_called_once() with patch('patroni.postgresql.mpp.citus.logger.debug') as mock_logger: - self.c.add_task('before_demote', 1, 'postgres://host:5432/postgres', 30) + self.c.add_task('before_demote', 1, self.cluster, + self.cluster.leader_name, 'postgres://host:5432/postgres', 30) mock_logger.assert_called_once() self.assertTrue(mock_logger.call_args[0][0].startswith('Adding the new task:')) with patch('patroni.postgresql.mpp.citus.logger.debug') as mock_logger: - self.c.add_task('before_promote', 1, 'postgres://host:5432/postgres', 30) + self.c.add_task('before_promote', 1, self.cluster, + self.cluster.leader_name, 'postgres://host:5432/postgres', 30) mock_logger.assert_called_once() self.assertTrue(mock_logger.call_args[0][0].startswith('Overriding existing task:')) - # add_task called from sync_meta_data should not override already scheduled or in flight task until deadline - self.assertIsNotNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres', 30)) - self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres')) + # add_task called from sync_pg_dist_node should not override already scheduled or in flight task until deadline + self.assertIsNotNone(self.c.add_task('after_promote', 1, self.cluster, + self.cluster.leader_name, 'postgres://host:5432/postgres', 30)) + self.assertIsNone(self.c.add_task('after_promote', 1, self.cluster, + self.cluster.leader_name, 'postgres://host:5432/postgres')) self.c._in_flight = self.c._tasks.pop() self.c._in_flight.deadline = self.c._in_flight.timeout + time.time() - self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres')) + self.assertIsNone(self.c.add_task('after_promote', 1, self.cluster, + self.cluster.leader_name, 
'postgres://host:5432/postgres'))
         self.c._in_flight.deadline = 0
-        self.assertIsNotNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres'))
+        self.assertIsNotNone(self.c.add_task('after_promote', 1, self.cluster,
+                                             self.cluster.leader_name, 'postgres://host:5432/postgres'))

         # If there is no transaction in progress and cached pg_dist_node matching desired state task should not be added
         self.c._schedule_load_pg_dist_node = False
-        self.c._pg_dist_node[self.c._in_flight.group] = self.c._in_flight
+        self.c._pg_dist_group[self.c._in_flight.groupid] = self.c._in_flight
         self.c._in_flight = None
-        self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres'))
+        self.assertIsNone(self.c.add_task('after_promote', 1, self.cluster,
+                                          self.cluster.leader_name, 'postgres://host:5432/postgres'))

     def test_pick_task(self):
-        self.c.add_task('after_promote', 1, 'postgres://host2:5432/postgres')
-        with patch.object(CitusHandler, 'process_task') as mock_process_task:
+        self.c.add_task('after_promote', 0, self.cluster, self.cluster.leader_name, 'postgres://host1:5432/postgres')
+        with patch.object(CitusHandler, 'update_node') as mock_update_node:
             self.c.process_tasks()
-            # process_task() shouln't be called because pick_task double checks with _pg_dist_node
-            mock_process_task.assert_not_called()
+            # process_task() shouldn't be called because pick_task double checks with _pg_dist_group
+            mock_update_node.assert_not_called()

     def test_process_task(self):
-        self.c.add_task('after_promote', 0, 'postgres://host2:5432/postgres')
-        task = self.c.add_task('before_promote', 1, 'postgres://host4:5432/postgres', 30)
+        self.c.add_task('after_promote', 1, self.cluster, self.cluster.leader_name, 'postgres://host2:5432/postgres')
+        task = self.c.add_task('before_promote', 1, self.cluster,
+                               self.cluster.leader_name, 'postgres://host4:5432/postgres', 30)
         self.c.process_tasks()
         self.assertTrue(task._event.is_set())

         # the after_promote should result only in COMMIT
-        task = self.c.add_task('after_promote', 1, 'postgres://host4:5432/postgres', 30)
+        task = self.c.add_task('after_promote', 1, self.cluster,
+                               self.cluster.leader_name, 'postgres://host4:5432/postgres', 30)
         with patch.object(CitusHandler, 'query') as mock_query:
             self.c.process_tasks()
             mock_query.assert_called_once()
             self.assertEqual(mock_query.call_args[0][0], 'COMMIT')

     def test_process_tasks(self):
-        self.c.add_task('after_promote', 0, 'postgres://host2:5432/postgres')
+        self.c.add_task('after_promote', 0, self.cluster, self.cluster.leader_name, 'postgres://host2:5432/postgres')
         self.c.process_tasks()
-        self.c.add_task('after_promote', 0, 'postgres://host3:5432/postgres')
+        self.c.add_task('after_promote', 0, self.cluster, self.cluster.leader_name, 'postgres://host3:5432/postgres')
         with patch('patroni.postgresql.mpp.citus.logger.error') as mock_logger, \
                 patch.object(CitusHandler, 'query', Mock(side_effect=Exception)):
             self.c.process_tasks()
@@ -119,16 +132,17 @@ def test_on_demote(self):

     @patch('patroni.postgresql.mpp.citus.logger.error')
     @patch.object(MockCursor, 'execute', Mock(side_effect=Exception))
-    def test_load_pg_dist_node(self, mock_logger):
-        # load_pg_dist_node() triggers, query fails and exception is property handled
+    def test_load_pg_dist_group(self, mock_logger):
+        # load_pg_dist_group() triggers, query fails and the exception is properly handled
         self.c.process_tasks()
-        self.assertTrue(self.c._schedule_load_pg_dist_node)
+        self.assertTrue(self.c._schedule_load_pg_dist_group)
         mock_logger.assert_called_once()
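+        # The prefix asserted below reflects the reworked pg_dist_node read;
+        # judging by the rows mocked in tests/__init__.py it selects columns
+        # like (groupid, nodename, nodeport, noderole, nodeid), hence the
+        # 'SELECT groupid, nodename, ' prefix.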
self.assertTrue(mock_logger.call_args[0][0].startswith('Exception when executing query')) - self.assertTrue(mock_logger.call_args[0][1].startswith('SELECT nodeid, groupid, ')) + self.assertTrue(mock_logger.call_args[0][1].startswith('SELECT groupid, nodename, ')) def test_wait(self): - task = self.c.add_task('before_demote', 1, 'postgres://host:5432/postgres', 30) + task = self.c.add_task('before_demote', 1, self.cluster, + self.cluster.leader_name, u'postgres://host:5432/postgres', 30) task._event.wait = Mock() task.wait() @@ -172,3 +186,209 @@ def test_bootstrap_duplicate_database(self, mock_logger): self.c.bootstrap() mock_logger.assert_called_once() self.assertTrue(mock_logger.call_args[0][0].startswith('Exception when creating database')) + + +class TestGroupTransition(unittest.TestCase): + nodeid = 100 + + def map_to_sql(self, group: int, transition: PgDistNode) -> str: + if transition.role not in ('primary', 'demoted', 'secondary'): + return "citus_remove_node('{0}', {1})".format(transition.host, transition.port) + elif transition.nodeid: + host = transition.host + ('-demoted' if transition.role == 'demoted' else '') + return "citus_update_node({0}, '{1}', {2})".format(transition.nodeid, host, transition.port) + else: + transition.nodeid = self.nodeid + self.nodeid += 1 + + return "citus_add_node('{0}', {1}, {2}, '{3}')".format(transition.host, transition.port, + group, transition.role) + + def check_transitions(self, old_topology: PgDistGroup, new_topology: PgDistGroup, + expected_transitions: List[str]) -> None: + check_topology = deepcopy(old_topology) + + transitions: List[str] = [] + for node in new_topology.transition(old_topology): + self.assertTrue(node not in check_topology or (check_topology.get(node) or node).role == 'demoted') + old_node = node.nodeid and next(iter(v for v in check_topology if v.nodeid == node.nodeid), None) + if old_node: + check_topology.discard(old_node) + transitions.append(self.map_to_sql(new_topology.groupid, node)) + check_topology.add(node) + self.assertEqual(transitions, expected_transitions) + + def test_new_topology(self): + old = PgDistGroup(0) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'primary'), + PgDistNode('2', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=100), + PgDistNode('2', 5432, 'secondary', nodeid=101)}) + self.check_transitions(old, new, + ["citus_add_node('1', 5432, 0, 'primary')", + "citus_add_node('2', 5432, 0, 'secondary')"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary'), + PgDistNode('2', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('2', 5432, 'primary', nodeid=1), + PgDistNode('1', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, + ["citus_update_node(1, '1-demoted', 5432)", + "citus_update_node(2, '1', 5432)", + "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_failover(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('2', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('2', 5432, 'primary', nodeid=1), + PgDistNode('1', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, + ["citus_update_node(1, '1-demoted', 5432)", + "citus_update_node(2, '1', 5432)", + 
"citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_failover_and_new_secondary(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('2', 5432, 'primary'), + PgDistNode('3', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('2', 5432, 'primary', nodeid=1), + PgDistNode('3', 5432, 'secondary', nodeid=2)}) + # the secondary record is used to add the new standby and primary record is updated with the new hostname + self.check_transitions(old, new, ["citus_update_node(2, '3', 5432)", "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_and_new_secondary_primary_gone(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('2', 5432, 'primary'), + PgDistNode('3', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('2', 5432, 'primary', nodeid=1), + PgDistNode('3', 5432, 'secondary', nodeid=2)}) + # the secondary record is used to add the new standby and primary record is updated with the new hostname + self.check_transitions(old, new, ["citus_update_node(2, '3', 5432)", "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_secondary_replaced(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'primary'), + PgDistNode('3', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('3', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, ["citus_update_node(2, '3', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_secondary_repmoved(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2), + PgDistNode('3', 5432, 'secondary', nodeid=3)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'primary'), + PgDistNode('3', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('3', 5432, 'secondary', nodeid=3)}) + self.check_transitions(old, new, ["citus_remove_node('2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_and_secondary_removed(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2), + PgDistNode('3', 5432, 'secondary', nodeid=3)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary'), + PgDistNode('2', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary', nodeid=2), + PgDistNode('2', 5432, 'primary', nodeid=1), + PgDistNode('3', 5432, 'secondary', nodeid=3)}) + self.check_transitions(old, new, + ["citus_update_node(1, '1-demoted', 5432)", + "citus_update_node(2, '1', 5432)", + "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_and_new_secondary(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary'), + PgDistNode('2', 5432, 'primary'), + PgDistNode('3', 5432, 'secondary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary', nodeid=2), + PgDistNode('2', 5432, 'primary', nodeid=1)}) + 
self.check_transitions(old, new, + ["citus_update_node(1, '1-demoted', 5432)", + "citus_update_node(2, '1', 5432)", + "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_failover_to_new_node_secondary_remains(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('2', 5432, 'secondary'), + PgDistNode('3', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('3', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, ["citus_update_node(1, '3', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_failover_to_new_node_secondary_removed(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('3', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('3', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + # the secondary record needs to be removed before we update the primary record + self.check_transitions(old, new, ["citus_update_node(1, '3', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_to_new_node_and_secondary_removed(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary'), + PgDistNode('3', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('3', 5432, 'primary', nodeid=1), + PgDistNode('1', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, ["citus_update_node(1, '3', 5432)", "citus_update_node(2, '1', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_with_pause(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'primary', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + self.check_transitions(old, new, ["citus_update_node(1, '1-demoted', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_after_paused_connections(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('2', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary', nodeid=2), + PgDistNode('2', 5432, 'primary', nodeid=1)}) + self.check_transitions(old, new, ["citus_update_node(2, '1', 5432)", "citus_update_node(1, '2', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_to_new_node_after_paused_connections(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, {PgDistNode('3', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('1', 5432, 'secondary', nodeid=2), + PgDistNode('3', 5432, 'primary', nodeid=1)}) + self.check_transitions(old, new, ["citus_update_node(1, '3', 5432)", "citus_update_node(2, '1', 5432)"]) + self.assertTrue(new.equals(expected, True)) + + def test_switchover_to_new_node_after_paused_connections_secondary_added(self): + old = PgDistGroup(0, {PgDistNode('1', 5432, 'demoted', nodeid=1), + PgDistNode('2', 5432, 'secondary', nodeid=2)}) + new = PgDistGroup(0, 
{PgDistNode('4', 5432, 'secondary'), + PgDistNode('3', 5432, 'primary')}) + expected = PgDistGroup(0, {PgDistNode('4', 5432, 'secondary', nodeid=2), + PgDistNode('3', 5432, 'primary', nodeid=1)}) + self.check_transitions(old, new, ["citus_update_node(1, '3', 5432)", "citus_update_node(2, '4', 5432)"]) + self.assertTrue(new.equals(expected, True)) diff --git a/tests/test_config.py b/tests/test_config.py index 6cad071dc..ef4ef1aef 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,7 @@ +import io import os import sys import unittest -import io from copy import deepcopy from unittest.mock import MagicMock, Mock, patch @@ -38,6 +38,7 @@ def test_reload_local_configuration(self): 'PATRONI_LOG_FORMAT': '["message", {"levelname": "level"}]', 'PATRONI_LOG_LOGGERS': 'patroni.postmaster: WARNING, urllib3: DEBUG', 'PATRONI_LOG_FILE_NUM': '5', + 'PATRONI_LOG_MODE': '0123', 'PATRONI_CITUS_DATABASE': 'citus', 'PATRONI_CITUS_GROUP': '0', 'PATRONI_CITUS_HOST': '0', @@ -81,6 +82,7 @@ def test_reload_local_configuration(self): 'PATRONI_POSTGRESQL_BIN_POSTGRES': 'sergtsop' }) config = Config('postgres0.yml') + self.assertEqual(config.local_configuration['log']['mode'], 0o123) with patch.object(Config, '_load_config_file', Mock(return_value={'restapi': {}})): with patch.object(Config, '_build_effective_configuration', Mock(side_effect=Exception)): config.reload_local_configuration() diff --git a/tests/test_config_generator.py b/tests/test_config_generator.py index d40cee1f2..ffba14ee7 100644 --- a/tests/test_config_generator.py +++ b/tests/test_config_generator.py @@ -1,19 +1,19 @@ import os -import psutil import unittest -import yaml -from . import MockConnect, MockCursor from copy import deepcopy -from unittest.mock import MagicMock, Mock, PropertyMock, mock_open as _mock_open, patch +from unittest.mock import MagicMock, Mock, mock_open as _mock_open, patch, PropertyMock + +import psutil +import yaml from patroni.__main__ import main as _main from patroni.config import Config from patroni.config_generator import AbstractConfigGenerator, get_address, NO_VALUE_MSG from patroni.log import PatroniLogger -from patroni.utils import patch_config, parse_bool +from patroni.utils import parse_bool, patch_config -from . import psycopg_connect +from . import MockConnect, MockCursor, psycopg_connect HOSTNAME = 'test_hostname' IP = '1.9.8.4' diff --git a/tests/test_consul.py b/tests/test_consul.py index 49ec3118a..d7288c52f 100644 --- a/tests/test_consul.py +++ b/tests/test_consul.py @@ -1,12 +1,16 @@ -import consul import unittest -from unittest.mock import Mock, PropertyMock, patch + +from unittest.mock import Mock, patch, PropertyMock + +import consul from consul import ConsulException, NotFound + from patroni.dcs import get_dcs -from patroni.dcs.consul import AbstractDCS, Cluster, Consul, ConsulInternalError, \ - ConsulError, ConsulClient, HTTPClient, InvalidSessionTTL, InvalidSession, RetryFailedError +from patroni.dcs.consul import AbstractDCS, Cluster, Consul, ConsulClient, ConsulError, \ + ConsulInternalError, HTTPClient, InvalidSession, InvalidSessionTTL, RetryFailedError from patroni.postgresql.mpp import get_mpp + from . 
import SleepException diff --git a/tests/test_ctl.py b/tests/test_ctl.py index d5d83740e..f126e939d 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -1,28 +1,31 @@ -import click -import etcd import os import unittest -from click.testing import CliRunner from datetime import datetime, timedelta +from unittest import mock +from unittest.mock import Mock, patch, PropertyMock + +import click +import etcd + +from click.testing import CliRunner +from prettytable import ALL, PrettyTable +from urllib3 import PoolManager + from patroni import global_config -from patroni.ctl import ctl, load_config, output_members, get_dcs, parse_dcs, \ - get_all_members, get_any_member, get_cursor, query_member, PatroniCtlException, apply_config_changes, \ - format_config_for_editing, show_diff, invoke_editor, format_pg_version, CONFIG_FILE_PATH, PatronictlPrettyTable +from patroni.ctl import apply_config_changes, CONFIG_FILE_PATH, ctl, format_config_for_editing, \ + format_pg_version, get_all_members, get_any_member, get_cursor, get_dcs, invoke_editor, load_config, \ + output_members, parse_dcs, PatroniCtlException, PatronictlPrettyTable, query_member, show_diff from patroni.dcs import Cluster, Failover from patroni.postgresql.config import get_param_diff from patroni.postgresql.mpp import get_mpp from patroni.psycopg import OperationalError from patroni.utils import tzutc -from prettytable import PrettyTable, ALL -from unittest import mock -from unittest.mock import patch, Mock, PropertyMock -from urllib3 import PoolManager from . import MockConnect, MockCursor, MockResponse, psycopg_connect from .test_etcd import etcd_read, socket_getaddrinfo -from .test_ha import get_cluster_initialized_without_leader, get_cluster_initialized_with_leader, \ - get_cluster_initialized_with_only_leader, get_cluster_not_initialized_without_leader, get_cluster, Member +from .test_ha import get_cluster, get_cluster_initialized_with_leader, get_cluster_initialized_with_only_leader, \ + get_cluster_initialized_without_leader, get_cluster_not_initialized_without_leader, Member def get_default_config(*args): diff --git a/tests/test_etcd.py b/tests/test_etcd.py index 656edff3c..1fe48a8ec 100644 --- a/tests/test_etcd.py +++ b/tests/test_etcd.py @@ -1,18 +1,21 @@ -import etcd -import urllib3.util.connection import socket import unittest +from unittest.mock import Mock, patch, PropertyMock + +import etcd +import urllib3.util.connection + from dns.exception import DNSException +from urllib3.exceptions import ReadTimeoutError + from patroni.dcs import get_dcs -from patroni.dcs.etcd import AbstractDCS, EtcdClient, Cluster, Etcd, EtcdError, DnsCachingResolver +from patroni.dcs.etcd import AbstractDCS, Cluster, DnsCachingResolver, Etcd, EtcdClient, EtcdError from patroni.exceptions import DCSError from patroni.postgresql.mpp import get_mpp from patroni.utils import Retry -from unittest.mock import Mock, PropertyMock, patch -from urllib3.exceptions import ReadTimeoutError -from . import SleepException, MockResponse, requests_get +from . 
import MockResponse, requests_get, SleepException def etcd_watch(self, key, index=None, timeout=None, recursive=None): @@ -344,7 +347,7 @@ def test_set_ttl(self): self.assertTrue(self.etcd.watch(None, 1)) def test_sync_state(self): - self.assertIsNone(self.etcd.write_sync_state('leader', None)) + self.assertIsNone(self.etcd.write_sync_state('leader', None, 0)) self.assertFalse(self.etcd.delete_sync_state()) def test_set_history_value(self): diff --git a/tests/test_etcd3.py b/tests/test_etcd3.py index 2ea3699ef..54941130d 100644 --- a/tests/test_etcd3.py +++ b/tests/test_etcd3.py @@ -1,18 +1,20 @@ -import etcd import json import unittest + +from threading import Thread +from unittest.mock import Mock, patch, PropertyMock + +import etcd import urllib3 -from unittest.mock import Mock, PropertyMock, patch from patroni.dcs import get_dcs from patroni.dcs.etcd import DnsCachingResolver -from patroni.dcs.etcd3 import PatroniEtcd3Client, Cluster, Etcd3, Etcd3Client, \ - Etcd3Error, Etcd3ClientError, RetryFailedError, InvalidAuthToken, Unavailable, \ - Unknown, UnsupportedEtcdVersion, UserEmpty, AuthFailed, AuthOldRevision, base64_encode +from patroni.dcs.etcd3 import AuthFailed, AuthOldRevision, base64_encode, Cluster, Etcd3, \ + Etcd3Client, Etcd3ClientError, Etcd3Error, InvalidAuthToken, PatroniEtcd3Client, \ + RetryFailedError, Unavailable, Unknown, UnsupportedEtcdVersion, UserEmpty from patroni.postgresql.mpp import get_mpp -from threading import Thread -from . import SleepException, MockResponse +from . import MockResponse, SleepException def mock_urlopen(self, method, url, **kwargs): diff --git a/tests/test_exhibitor.py b/tests/test_exhibitor.py index 0620292ca..fae1586d5 100644 --- a/tests/test_exhibitor.py +++ b/tests/test_exhibitor.py @@ -1,12 +1,14 @@ import unittest -import urllib3 + from unittest.mock import Mock, patch +import urllib3 + from patroni.dcs import get_dcs -from patroni.dcs.exhibitor import ExhibitorEnsembleProvider, Exhibitor +from patroni.dcs.exhibitor import Exhibitor, ExhibitorEnsembleProvider from patroni.dcs.zookeeper import ZooKeeperError -from . import SleepException, requests_get +from . 
import requests_get, SleepException from .test_zookeeper import MockKazooClient diff --git a/tests/test_file_perm.py b/tests/test_file_perm.py index 09af602b6..afd1dfa4d 100644 --- a/tests/test_file_perm.py +++ b/tests/test_file_perm.py @@ -1,5 +1,6 @@ -import unittest import stat +import unittest + from unittest.mock import Mock, patch from patroni.file_perm import pg_perm @@ -30,3 +31,6 @@ def test_set_umask(self, mock_logger, mock_umask, mock_stat): def test_set_permissions_from_data_directory(self, mock_logger): pg_perm.set_permissions_from_data_directory('test') self.assertEqual(mock_logger.call_args[0][0], 'Can not check permissions on %s: %r') + + def test_orig_umask(self): + self.assertIsNotNone(pg_perm.orig_umask) diff --git a/tests/test_ha.py b/tests/test_ha.py index 3f624c828..d6f043589 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -1,16 +1,18 @@ import datetime -import etcd import os import sys -from unittest.mock import Mock, MagicMock, PropertyMock, patch, mock_open + +from unittest.mock import MagicMock, Mock, mock_open, patch, PropertyMock + +import etcd from patroni import global_config from patroni.collections import CaseInsensitiveSet from patroni.config import Config -from patroni.dcs import Cluster, ClusterConfig, Failover, Leader, Member, get_dcs, Status, SyncState, TimelineHistory +from patroni.dcs import Cluster, ClusterConfig, Failover, get_dcs, Leader, Member, Status, SyncState, TimelineHistory from patroni.dcs.etcd import AbstractEtcdClientWithFailover -from patroni.exceptions import DCSError, PostgresConnectionException, PatroniFatalException -from patroni.ha import Ha, _MemberStatus +from patroni.exceptions import DCSError, PatroniFatalException, PostgresConnectionException +from patroni.ha import _MemberStatus, Ha from patroni.postgresql import Postgresql from patroni.postgresql.bootstrap import Bootstrap from patroni.postgresql.cancellable import CancellableSubprocess @@ -18,11 +20,12 @@ from patroni.postgresql.postmaster import PostmasterProcess from patroni.postgresql.rewind import Rewind from patroni.postgresql.slots import SlotsHandler +from patroni.postgresql.sync import _SyncState from patroni.utils import tzutc from patroni.watchdog import Watchdog -from . import PostgresInit, MockPostmaster, psycopg_connect, requests_get -from .test_etcd import socket_getaddrinfo, etcd_read, etcd_write +from . 
import MockPostmaster, PostgresInit, psycopg_connect, requests_get +from .test_etcd import etcd_read, etcd_write, socket_getaddrinfo SYSID = '12345678901' @@ -63,7 +66,7 @@ def get_cluster_initialized_without_leader(leader=False, failover=None, sync=Non 'tags': {'clonefrom': True}, 'scheduled_restart': {'schedule': "2100-01-01 10:53:07.560445+00:00", 'postgres_version': '99.0.0'}}) - syncstate = SyncState(0 if sync else None, sync and sync[0], sync and sync[1]) + syncstate = SyncState(0 if sync else None, sync and sync[0], sync and sync[1], 0) failsafe = {m.name: m.api_url for m in (m1, m2)} if failsafe else None return get_cluster(SYSID, leader, [m1, m2], failover, syncstate, cluster_config, failsafe) @@ -118,11 +121,6 @@ def __init__(self, p, d): restapi: listen: 0.0.0.0:8008 bootstrap: - users: - replicator: - password: rep-pass - options: - - replication postgresql: name: foo data_dir: data/postgresql0 @@ -207,6 +205,7 @@ class TestHa(PostgresInit): @patch('patroni.dcs.dcs_modules', Mock(return_value=['patroni.dcs.etcd'])) @patch.object(etcd.Client, 'read', etcd_read) @patch.object(AbstractEtcdClientWithFailover, '_get_machines_list', Mock(return_value=['http://remotehost:2379'])) + @patch.object(Config, '_load_cache', Mock()) def setUp(self): super(TestHa, self).setUp() self.p.set_state('running') @@ -1309,7 +1308,7 @@ def test_demote_immediate(self, follow): self.ha.demote('immediate') follow.assert_called_once_with(None) - def test_process_sync_replication(self): + def test__process_multisync_replication(self): self.ha.has_lock = true mock_set_sync = self.p.sync_handler.set_synchronous_standby_names = Mock() mock_cfg_set_sync = self.p.config.set_synchronous_standby_names = Mock() @@ -1339,8 +1338,9 @@ def test_process_sync_replication(self): self.ha.is_synchronous_mode = true # Test sync standby not touched when picking the same node - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(['other']), - CaseInsensitiveSet(['other']))) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 1, 1, + CaseInsensitiveSet(['other']), + CaseInsensitiveSet(['other']))) self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.ha.run_cycle() mock_set_sync.assert_not_called() @@ -1349,15 +1349,17 @@ def test_process_sync_replication(self): mock_cfg_set_sync.reset_mock() # Test sync standby is replaced when switching standbys - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(['other2']), CaseInsensitiveSet())) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 0, 0, CaseInsensitiveSet(), + CaseInsensitiveSet(['other2']))) self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) self.ha.run_cycle() mock_set_sync.assert_called_once_with(CaseInsensitiveSet(['other2'])) mock_cfg_set_sync.assert_not_called() # Test sync standby is replaced when new standby is joined - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(['other2', 'other3']), - CaseInsensitiveSet(['other2']))) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 1, 1, + CaseInsensitiveSet(['other2']), + CaseInsensitiveSet(['other2', 'other3']))) self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) self.ha.run_cycle() self.assertEqual(mock_set_sync.call_args_list[0][0], (CaseInsensitiveSet(['other2']),)) @@ -1378,8 +1380,9 @@ def test_process_sync_replication(self): self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) 
self.ha.dcs.get_cluster = Mock(return_value=get_cluster_initialized_with_leader(sync=('leader', 'other'))) # self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(['other2']), - CaseInsensitiveSet(['other2']))) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 1, 1, + CaseInsensitiveSet(['other2']), + CaseInsensitiveSet(['other2']))) self.ha.run_cycle() self.assertEqual(self.ha.dcs.write_sync_state.call_count, 2) @@ -1402,9 +1405,10 @@ def test_process_sync_replication(self): # Test sync set to '*' when synchronous_mode_strict is enabled mock_set_sync.reset_mock() mock_cfg_set_sync.reset_mock() - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(), CaseInsensitiveSet())) - self.ha.cluster.config.data['synchronous_mode_strict'] = True - self.ha.run_cycle() + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 0, 0, CaseInsensitiveSet(), + CaseInsensitiveSet())) + with patch.object(global_config.__class__, 'is_synchronous_mode_strict', PropertyMock(return_value=True)): + self.ha.run_cycle() mock_set_sync.assert_called_once_with(CaseInsensitiveSet('*')) mock_cfg_set_sync.assert_not_called() @@ -1432,8 +1436,8 @@ def test_sync_replication_become_primary(self): # When we just became primary nobody is sync self.assertEqual(self.ha.enforce_primary_role('msg', 'promote msg'), 'promote msg') - mock_set_sync.assert_called_once_with(CaseInsensitiveSet()) - mock_write_sync.assert_called_once_with('leader', None, version=0) + mock_set_sync.assert_called_once_with(CaseInsensitiveSet(), 0) + mock_write_sync.assert_called_once_with('leader', None, 0, version=0) mock_set_sync.reset_mock() @@ -1471,7 +1475,7 @@ def test_unhealthy_sync_mode(self): mock_acquire.assert_called_once() mock_follow.assert_not_called() mock_promote.assert_called_once() - mock_write_sync.assert_called_once_with('other', None, version=0) + mock_write_sync.assert_called_once_with('other', None, 0, version=0) def test_disable_sync_when_restarting(self): self.ha.is_synchronous_mode = true @@ -1513,7 +1517,8 @@ def test_enable_synchronous_mode(self): self.ha.is_synchronous_mode = true self.ha.has_lock = true self.p.name = 'leader' - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet(), CaseInsensitiveSet())) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 0, 0, + CaseInsensitiveSet(), CaseInsensitiveSet())) self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) with patch('patroni.ha.logger.info') as mock_logger: self.ha.run_cycle() @@ -1529,7 +1534,8 @@ def test_inconsistent_synchronous_state(self): self.ha.has_lock = true self.p.name = 'leader' self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'a')) - self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet('a'), CaseInsensitiveSet())) + self.p.sync_handler.current_state = Mock(return_value=_SyncState('priority', 0, 0, + CaseInsensitiveSet(), CaseInsensitiveSet('a'))) self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) mock_set_sync = self.p.sync_handler.set_synchronous_standby_names = Mock() with patch('patroni.ha.logger.warning') as mock_logger: @@ -1699,3 +1705,113 @@ def test_notify_citus_coordinator(self): mock_logger.assert_called() self.assertTrue(mock_logger.call_args[0][0].startswith('Request to %s coordinator leader')) self.assertEqual(mock_logger.call_args[0][1], 'Citus') + + 
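+    # _SyncState, as constructed in the quorum tests below, appears to carry:
+    # the replication mode ('priority' or 'quorum'), numsync,
+    # numsync_confirmed, the set of synchronous standbys, and the set of
+    # active (streaming) standbys, e.g.:
+    #   _SyncState('quorum', 1, 1, CaseInsensitiveSet(['other']),
+    #              CaseInsensitiveSet(['other']))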
@patch.object(global_config.__class__, 'is_synchronous_mode', PropertyMock(return_value=True)) + @patch.object(global_config.__class__, 'is_quorum_commit_mode', PropertyMock(return_value=True)) + def test_process_sync_replication_prepromote(self): + self.p._major_version = 90500 + self.ha.cluster = get_cluster_initialized_without_leader(sync=('other', self.p.name + ',foo')) + self.p.is_primary = false + self.p.set_role('replica') + mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=None) + # Postgres 9.5, write_sync_state to DCS failed + self.assertEqual(self.ha.run_cycle(), + 'Postponing promotion because synchronous replication state was updated by somebody else') + self.assertEqual(self.ha.dcs.write_sync_state.call_count, 1) + self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, None, 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': 0}) + + mock_set_sync = self.p.config.set_synchronous_standby_names = Mock() + mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=True) + # Postgres 9.5, our name is written to leader of the /sync key, while voters list and ssn is empty + self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') + self.assertEqual(self.ha.dcs.write_sync_state.call_count, 1) + self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, None, 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': 0}) + self.assertEqual(mock_set_sync.call_count, 1) + self.assertEqual(mock_set_sync.call_args_list[0][0], (None,)) + + self.p._major_version = 90600 + mock_set_sync.reset_mock() + mock_write_sync.reset_mock() + self.p.set_role('replica') + # Postgres 9.6, with quorum commit we avoid updating /sync key and put some nodes to ssn + self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') + self.assertEqual(mock_write_sync.call_count, 0) + self.assertEqual(mock_set_sync.call_count, 1) + self.assertEqual(mock_set_sync.call_args_list[0][0], ('2 (foo,other)',)) + + self.p._major_version = 150000 + mock_set_sync.reset_mock() + self.p.set_role('replica') + self.p.name = 'nonsync' + self.ha.fetch_node_status = get_node_status() + # Postgres 15, with quorum commit. 
Non-sync node promoted we avoid updating /sync key and put some nodes to ssn + self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') + self.assertEqual(mock_write_sync.call_count, 0) + self.assertEqual(mock_set_sync.call_count, 1) + self.assertEqual(mock_set_sync.call_args_list[0][0], ('ANY 3 (foo,other,postgresql0)',)) + + @patch.object(global_config.__class__, 'is_synchronous_mode', PropertyMock(return_value=True)) + @patch.object(global_config.__class__, 'is_quorum_commit_mode', PropertyMock(return_value=True)) + def test__process_quorum_replication(self): + self.p._major_version = 150000 + self.ha.has_lock = true + mock_set_sync = self.p.config.set_synchronous_standby_names = Mock() + self.p.name = 'leader' + + mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=None) + # Test /sync key is attempted to set and failed when missing or invalid + self.p.sync_handler.current_state = Mock(return_value=_SyncState('quorum', 1, 1, CaseInsensitiveSet(['other']), + CaseInsensitiveSet(['other']))) + self.ha.run_cycle() + self.assertEqual(mock_write_sync.call_count, 1) + self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, None, 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': None}) + self.assertEqual(mock_set_sync.call_count, 0) + + self.ha._promote_timestamp = 1 + mock_write_sync = self.ha.dcs.write_sync_state = Mock(side_effect=[SyncState(None, self.p.name, None, 0), None]) + # Test /sync key is attempted to set and succeed when missing or invalid + with patch.object(SyncState, 'is_empty', Mock(side_effect=[True, False])): + self.ha.run_cycle() + self.assertEqual(mock_write_sync.call_count, 2) + self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, None, 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': None}) + self.assertEqual(mock_write_sync.call_args_list[1][0], (self.p.name, CaseInsensitiveSet(['other']), 0)) + self.assertEqual(mock_write_sync.call_args_list[1][1], {'version': None}) + self.assertEqual(mock_set_sync.call_count, 0) + + self.p.sync_handler.current_state = Mock(side_effect=[_SyncState('quorum', 1, 0, CaseInsensitiveSet(['foo']), + CaseInsensitiveSet(['other'])), + _SyncState('quorum', 1, 1, CaseInsensitiveSet(['foo']), + CaseInsensitiveSet(['foo']))]) + mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=SyncState(1, 'leader', 'foo', 0)) + self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'foo')) + # Test the sync node is removed from voters, added to ssn + with patch.object(Postgresql, 'synchronous_standby_names', Mock(return_value='other')), \ + patch('time.sleep', Mock()): + self.ha.run_cycle() + self.assertEqual(mock_write_sync.call_count, 1) + self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, CaseInsensitiveSet(), 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': 0}) + self.assertEqual(mock_set_sync.call_count, 1) + self.assertEqual(mock_set_sync.call_args_list[0][0], ('ANY 1 (other)',)) + + # Test ANY 1 (*) when synchronous_mode_strict and no nodes available + self.p.sync_handler.current_state = Mock(return_value=_SyncState('quorum', 1, 0, + CaseInsensitiveSet(['other', 'foo']), + CaseInsensitiveSet())) + mock_write_sync.reset_mock() + mock_set_sync.reset_mock() + with patch.object(global_config.__class__, 'is_synchronous_mode_strict', PropertyMock(return_value=True)): + self.ha.run_cycle() + self.assertEqual(mock_write_sync.call_count, 1) + 
self.assertEqual(mock_write_sync.call_args_list[0][0], (self.p.name, CaseInsensitiveSet(), 0)) + self.assertEqual(mock_write_sync.call_args_list[0][1], {'version': 0}) + self.assertEqual(mock_set_sync.call_count, 1) + self.assertEqual(mock_set_sync.call_args_list[0][0], ('ANY 1 (*)',)) + + # Test that _process_quorum_replication doesn't take longer than loop_wait + with patch('time.time', Mock(side_effect=[30, 60, 90, 120])): + self.ha.process_sync_replication() diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py index 1bfd5b120..432158de9 100644 --- a/tests/test_kubernetes.py +++ b/tests/test_kubernetes.py @@ -4,16 +4,19 @@ import socket import time import unittest -import urllib3 + +from threading import Thread from unittest import mock -from unittest.mock import Mock, PropertyMock, mock_open, patch +from unittest.mock import Mock, mock_open, patch, PropertyMock + +import urllib3 from patroni.dcs import get_dcs from patroni.dcs.kubernetes import Cluster, k8s_client, k8s_config, K8sConfig, K8sConnectionFailed, \ - K8sException, K8sObject, Kubernetes, KubernetesError, KubernetesRetriableException, \ - Retry, RetryFailedError, SERVICE_HOST_ENV_NAME, SERVICE_PORT_ENV_NAME + K8sException, K8sObject, Kubernetes, KubernetesError, KubernetesRetriableException, Retry, \ + RetryFailedError, SERVICE_HOST_ENV_NAME, SERVICE_PORT_ENV_NAME from patroni.postgresql.mpp import get_mpp -from threading import Thread + from . import MockResponse, SleepException @@ -439,7 +442,7 @@ def test_delete_sync_state(self): @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_endpoints', mock_namespaced_kind, create=True) def test_write_sync_state(self): - self.assertIsNotNone(self.k.write_sync_state('a', ['b'], 1)) + self.assertIsNotNone(self.k.write_sync_state('a', ['b'], 0, 1)) @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_pod', mock_namespaced_kind, create=True) @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', mock_namespaced_kind, create=True) diff --git a/tests/test_log.py b/tests/test_log.py index 3d4e79ab7..d0ab7ca80 100644 --- a/tests/test_log.py +++ b/tests/test_log.py @@ -2,11 +2,13 @@ import os import sys import unittest -import yaml + from io import StringIO -from queue import Queue, Full +from queue import Full, Queue from unittest.mock import Mock, patch +import yaml + from patroni.config import Config from patroni.log import PatroniLogger @@ -38,6 +40,7 @@ def test_patroni_logger(self): 'traceback_level': 'DEBUG', 'max_queue_size': 5, 'dir': 'foo', + 'mode': 0o600, 'file_size': 4096, 'file_num': 5, 'loggers': { @@ -50,7 +53,9 @@ def test_patroni_logger(self): os.environ[Config.PATRONI_CONFIG_VARIABLE] = yaml.dump(config, default_flow_style=False) logger = PatroniLogger() patroni_config = Config(None) - logger.reload_config(patroni_config['log']) + with patch('os.chmod') as mock_chmod: + logger.reload_config(patroni_config['log']) + self.assertEqual(mock_chmod.call_args[0][1], 0o600) _LOG.exception('test') logger.start() diff --git a/tests/test_mpp.py b/tests/test_mpp.py index 9eb876334..12e48778b 100644 --- a/tests/test_mpp.py +++ b/tests/test_mpp.py @@ -1,4 +1,5 @@ from typing import Any + from patroni.exceptions import PatroniException from patroni.postgresql.mpp import AbstractMPP, get_mpp, Null diff --git a/tests/test_patroni.py b/tests/test_patroni.py index fb5c0fc7c..e65b1870a 100644 --- a/tests/test_patroni.py +++ b/tests/test_patroni.py @@ -1,13 +1,18 @@ -import etcd import logging import os import signal import time import unittest -from unittest.mock 
import Mock, PropertyMock, patch -import patroni.config as config from http.server import HTTPServer +from threading import Thread +from unittest.mock import Mock, patch, PropertyMock + +import etcd + +import patroni.config as config + +from patroni.__main__ import check_psycopg, main as _main, Patroni from patroni.api import RestApiServer from patroni.async_executor import AsyncExecutor from patroni.dcs import Cluster, Member @@ -15,8 +20,6 @@ from patroni.exceptions import DCSError from patroni.postgresql import Postgresql from patroni.postgresql.config import ConfigHandler -from patroni.__main__ import check_psycopg, Patroni, main as _main -from threading import Thread from . import psycopg_connect, SleepException from .test_etcd import etcd_read, etcd_write diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index ff76ade1d..c249bdbc7 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -1,13 +1,15 @@ import datetime import os -import psutil import re import subprocess import time from copy import deepcopy from pathlib import Path -from unittest.mock import Mock, MagicMock, PropertyMock, patch, mock_open +from threading import current_thread, Thread +from unittest.mock import MagicMock, Mock, mock_open, patch, PropertyMock + +import psutil import patroni.psycopg as psycopg @@ -15,22 +17,19 @@ from patroni.async_executor import CriticalTask from patroni.collections import CaseInsensitiveDict, CaseInsensitiveSet from patroni.dcs import RemoteMember -from patroni.exceptions import PostgresConnectionException, PatroniException -from patroni.postgresql import Postgresql, STATE_REJECT, STATE_NO_RESPONSE +from patroni.exceptions import PatroniException, PostgresConnectionException +from patroni.postgresql import Postgresql, STATE_NO_RESPONSE, STATE_REJECT from patroni.postgresql.bootstrap import Bootstrap from patroni.postgresql.callback_executor import CallbackAction -from patroni.postgresql.config import get_param_diff, _false_validator +from patroni.postgresql.config import _false_validator, get_param_diff from patroni.postgresql.postmaster import PostmasterProcess -from patroni.postgresql.validator import (ValidatorFactoryNoType, ValidatorFactoryInvalidType, - ValidatorFactoryInvalidSpec, ValidatorFactory, InvalidGucValidatorsFile, - _get_postgres_guc_validators, _read_postgres_gucs_validators_file, - _load_postgres_gucs_validators, Bool, Integer, Real, Enum, EnumBool, String) +from patroni.postgresql.validator import _get_postgres_guc_validators, _load_postgres_gucs_validators, \ + _read_postgres_gucs_validators_file, Bool, Enum, EnumBool, Integer, InvalidGucValidatorsFile, Real, String, \ + ValidatorFactory, ValidatorFactoryInvalidSpec, ValidatorFactoryInvalidType, ValidatorFactoryNoType from patroni.utils import RetryFailedError -from threading import Thread, current_thread - -from . import (BaseTestPostgresql, MockCursor, MockPostmaster, psycopg_connect, mock_available_gucs, - GET_PG_SETTINGS_RESULT) +from . 
import BaseTestPostgresql, GET_PG_SETTINGS_RESULT, \ + mock_available_gucs, MockCursor, MockPostmaster, psycopg_connect mtime_ret = {} diff --git a/tests/test_postmaster.py b/tests/test_postmaster.py index 0924e6f7b..1075902d8 100644 --- a/tests/test_postmaster.py +++ b/tests/test_postmaster.py @@ -1,7 +1,9 @@ import multiprocessing -import psutil import unittest -from unittest.mock import Mock, patch, mock_open + +from unittest.mock import Mock, mock_open, patch + +import psutil from patroni.postgresql.postmaster import PostmasterProcess diff --git a/tests/test_quorum.py b/tests/test_quorum.py new file mode 100644 index 000000000..2ae380fec --- /dev/null +++ b/tests/test_quorum.py @@ -0,0 +1,473 @@ +import unittest + +from typing import List, Set, Tuple + +from patroni.quorum import QuorumError, QuorumStateResolver + + +class QuorumTest(unittest.TestCase): + + def check_state_transitions(self, leader: str, quorum: int, voters: Set[str], numsync: int, sync: Set[str], + numsync_confirmed: int, active: Set[str], sync_wanted: int, leader_wanted: str, + expected: List[Tuple[str, str, int, Set[str]]]) -> None: + kwargs = { + 'leader': leader, 'quorum': quorum, 'voters': voters, + 'numsync': numsync, 'sync': sync, 'numsync_confirmed': numsync_confirmed, + 'active': active, 'sync_wanted': sync_wanted, 'leader_wanted': leader_wanted + } + result = list(QuorumStateResolver(**kwargs)) + self.assertEqual(result, expected) + + # also check interrupted transitions + if len(result) > 0 and result[0][0] != 'restart' and kwargs['leader'] == result[0][1]: + if result[0][0] == 'sync': + kwargs.update(numsync=result[0][2], sync=result[0][3]) + else: + kwargs.update(leader=result[0][1], quorum=result[0][2], voters=result[0][3]) + kwargs['expected'] = expected[1:] + self.check_state_transitions(**kwargs) + + def test_1111(self): + leader = 'a' + + # Add node + self.check_state_transitions(leader=leader, quorum=0, voters=set(), + numsync=0, sync=set(), numsync_confirmed=0, active=set('b'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 1, set('b')), + ('restart', leader, 0, set()), + ]) + self.check_state_transitions(leader=leader, quorum=0, voters=set(), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('b'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('b')) + ]) + + self.check_state_transitions(leader=leader, quorum=0, voters=set(), + numsync=0, sync=set(), numsync_confirmed=0, active=set('bcde'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bcde')), + ('restart', leader, 0, set()), + ]) + self.check_state_transitions(leader=leader, quorum=0, voters=set(), + numsync=2, sync=set('bcde'), numsync_confirmed=1, active=set('bcde'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 3, set('bcde')), + ]) + + def test_1222(self): + """2 node cluster""" + leader = 'a' + + # Active set matches state + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('b'), + sync_wanted=2, leader_wanted=leader, expected=[]) + + # Add node by increasing quorum + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('BC'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 1, set('bC')), + ('sync', leader, 1, set('bC')), + ]) + + # Add node by increasing sync + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, 
sync=set('b'), numsync_confirmed=1, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bc')), + ('quorum', leader, 1, set('bc')), + ]) + # Reduce quorum after added node caught up + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=2, sync=set('bc'), numsync_confirmed=2, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('bc')), + ]) + + # Add multiple nodes by increasing both sync and quorum + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('BCdE'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bC')), + ('quorum', leader, 3, set('bCdE')), + ('sync', leader, 2, set('bCdE')), + ]) + # Reduce quorum after added nodes caught up + self.check_state_transitions(leader=leader, quorum=3, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=3, active=set('bcde'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 2, set('bcde')), + ]) + + # Primary is alone + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, sync=set('b'), numsync_confirmed=0, active=set(), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set()), + ('sync', leader, 0, set()), + ]) + + # Swap out sync replica + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=1, sync=set('b'), numsync_confirmed=0, active=set('c'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set()), + ('sync', leader, 1, set('c')), + ('restart', leader, 0, set()), + ]) + # Update quorum when added node caught up + self.check_state_transitions(leader=leader, quorum=0, voters=set(), + numsync=1, sync=set('c'), numsync_confirmed=1, active=set('c'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('c')), + ]) + + def test_1233(self): + """Interrupted transition from 2 node cluster to 3 node fully sync cluster""" + leader = 'a' + + # Node c went away, transition back to 2 node cluster + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('b'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 1, set('b')), + ]) + +
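# The expected transition lists in these tests all preserve one overlap
# property at every settled state. A toy check of that property, as read from
# the expectations (not QuorumStateResolver.check_invariants itself): any
# quorum + 1 voters must intersect any numsync members of sync, so a failover
# candidate that polled a full quorum has seen every sync-acknowledged commit.
def is_safe(quorum, voters, numsync, sync):
    return quorum + 1 + numsync > len(voters | sync)

assert is_safe(0, set('b'), 1, set('b'))        # settled 2 node cluster
assert is_safe(1, set('bc'), 2, set('bc'))      # transitional 3 node state
assert not is_safe(0, set('bc'), 1, set('bc'))  # rejected in test_invalid_states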
# Node c is available: transition to a larger quorum set, though it has not caught up yet. + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 1, set('bc')), + ]) + + # Add in a new node at the same time, but node c hasn't caught up yet + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('bcd'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 2, set('bcd')), + ('sync', leader, 2, set('bcd')), + ]) + # All sync nodes caught up, reduce quorum + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcd'), + numsync=2, sync=set('bcd'), numsync_confirmed=3, active=set('bcd'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 1, set('bcd')), + ]) + + # Change replication factor at the same time + self.check_state_transitions(leader=leader, quorum=0, voters=set('b'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('bc'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 1, set('bc')), + ('sync', leader, 1, set('bc')), + ]) + + def test_2322(self): + """Interrupted transition from 2 node cluster to 3 node cluster with replication factor 2""" + leader = 'a' + + # Node c went away, transition back to 2 node cluster + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('b'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('b')), + ]) + + # Node c is available: transition to a larger quorum set. + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('bc'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('sync', leader, 1, set('bc')), + ]) + + # Add in a new node at the same time + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('bcd'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('sync', leader, 1, set('bc')), + ('quorum', leader, 2, set('bcd')), + ('sync', leader, 1, set('bcd')), + ]) + + # Convert to a fully synced cluster + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=1, sync=set('b'), numsync_confirmed=1, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bc')), + ]) + # Reduce quorum after all nodes caught up + self.check_state_transitions(leader=leader, quorum=1, voters=set('bc'), + numsync=2, sync=set('bc'), numsync_confirmed=2, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('bc')), + ]) + + def test_3535(self): + leader = 'a' + + # remove nodes + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=2, active=set('bc'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bc')), + ('quorum', leader, 0, set('bc')), + ]) + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=3, active=set('bcd'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bcd')), + ('quorum', leader, 1, set('bcd')), + ]) + +
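# The multi-step expectations above interleave 'sync' and 'quorum' changes so
# that the overlap property holds between the steps as well. A quick check
# with the same toy is_safe() (repeated here so the snippet stands alone):
def is_safe(quorum, voters, numsync, sync):
    return quorum + 1 + numsync > len(voters | sync)

# Shrinking voters/sync from {b,c,d,e} to {b,c} trims ssn first, then quorum:
assert is_safe(2, set('bcde'), 2, set('bc'))     # after ('sync', 2, {b, c})
assert is_safe(0, set('bc'), 2, set('bc'))       # after ('quorum', 0, {b, c})
# The reverse order would pass through an unsafe intermediate state:
assert not is_safe(0, set('bc'), 2, set('bcde'))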
# remove nodes and decrease sync + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=2, active=set('bc'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bc')), + ('quorum', leader, 1, set('bc')), + ('sync', leader, 1, set('bc')), + ]) + self.check_state_transitions(leader=leader, quorum=1, voters=set('bcde'), + numsync=3, sync=set('bcde'), numsync_confirmed=2, active=set('bc'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('sync', leader, 3, set('bcd')), + ('quorum', leader, 1, set('bc')), + ('sync', leader, 1, set('bc')), + ]) + + # Increase replication factor and decrease quorum + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=2, active=set('bcde'), + sync_wanted=3, leader_wanted=leader, expected=[ + ('sync', leader, 3, set('bcde')), + ]) + # decrease quorum after more nodes caught up + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=3, sync=set('bcde'), numsync_confirmed=3, active=set('bcde'), + sync_wanted=3, leader_wanted=leader, expected=[ + ('quorum', leader, 1, set('bcde')), + ]) + + # Add node with decreasing sync and increasing quorum + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=2, active=set('bcdef'), + sync_wanted=1, leader_wanted=leader, expected=[ + # increase quorum by 2, 1 for added node and another for reduced sync + ('quorum', leader, 4, set('bcdef')), + # now reduce replication factor to requested value + ('sync', leader, 1, set('bcdef')), + ]) + + # Remove node with increasing sync and decreasing quorum + self.check_state_transitions(leader=leader, quorum=2, voters=set('bcde'), + numsync=2, sync=set('bcde'), numsync_confirmed=2, active=set('bcd'), + sync_wanted=3, leader_wanted=leader, expected=[ + # node e removed from sync with replication factor increase + ('sync', leader, 3, set('bcd')), + # node e removed from voters with quorum decrease + ('quorum', leader, 1, set('bcd')), + ]) + + def test_remove_nosync_node(self): + leader = 'a' + self.check_state_transitions(leader=leader, quorum=0, voters=set('bc'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('b'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('b')), + ('sync', leader, 1, set('b')) + ]) + + def test_swap_sync_node(self): + leader = 'a' + self.check_state_transitions(leader=leader, quorum=0, voters=set('bc'), + numsync=2, sync=set('bc'), numsync_confirmed=1, active=set('bd'), + sync_wanted=2, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set('b')), + ('sync', leader, 2, set('bd')), + ('quorum', leader, 1, set('bd')) + ]) + + def test_promotion(self): + # Beginning state: 'a' is the primary, 1 of bcd in sync + # a fails, c gets quorum votes and promotes + self.check_state_transitions(leader='a', quorum=2, voters=set('bcd'), + numsync=0, sync=set(), numsync_confirmed=0, active=set(), + sync_wanted=1, leader_wanted='c', expected=[ + ('sync', 'a', 1, set('abd')), # set a, b and d to sync + ('quorum', 'c', 2, set('abd')), # set c as the leader and move a to voters + # and stop because there are no active nodes + ]) + + # next loop, b managed to reconnect + self.check_state_transitions(leader='c', quorum=2, voters=set('abd'), + numsync=1, sync=set('abd'), numsync_confirmed=0, active=set('b'), + sync_wanted=1, leader_wanted='c', expected=[ + ('sync', 'c', 1, set('b')), # remove a from sync as inactive + ('quorum', 'c', 0, set('b')), # remove a from voters and reduce quorum + ]) + +
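# The promotion expectations pivot the /sync membership around the candidate:
# the failed leader stays in the sync/voter sets until the new leader learns
# who is actually streaming, because the old leader may still hold the most
# recent synchronously acknowledged commits. A sketch of that pivot;
# illustrative only, not the resolver's implementation:
def pivot_voters(old_leader, old_voters, candidate):
    # the candidate leaves the voter set, the failed leader joins it
    return (set(old_voters) - {candidate}) | {old_leader}

assert pivot_voters('a', set('bcd'), 'c') == set('abd')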
# alternative reality: next loop, no one reconnected + self.check_state_transitions(leader='c', quorum=2, voters=set('abd'), + numsync=1, sync=set('abd'), numsync_confirmed=0, active=set(), + sync_wanted=1, leader_wanted='c', expected=[ + ('quorum', 'c', 0, set()), + ('sync', 'c', 0, set()), + ]) + + def test_nonsync_promotion(self): + # Beginning state: 1 of bc in sync. e.g. (a primary, ssn = ANY 1 (b c)) + # a fails, d sees b and c, knows that it is in sync and decides to promote. + # We include the former primary in the sync state, increasing the + # replication factor, and let the situation resolve. Node d ssn=ANY 1 (b c) + leader = 'd' + self.check_state_transitions(leader='a', quorum=1, voters=set('bc'), + numsync=0, sync=set(), numsync_confirmed=0, active=set(), + sync_wanted=1, leader_wanted=leader, expected=[ + # Set a, b, and c to sync and increase replication factor + ('sync', 'a', 2, set('abc')), + # Set ourselves as the leader and move the old leader to voters + ('quorum', leader, 1, set('abc')), + # and stop because there are no active nodes + ]) + # next loop, b and c managed to reconnect + self.check_state_transitions(leader=leader, quorum=1, voters=set('abc'), + numsync=2, sync=set('abc'), numsync_confirmed=0, active=set('bc'), + sync_wanted=1, leader_wanted=leader, expected=[ + ('sync', leader, 2, set('bc')), # Remove a from being synced to. + ('quorum', leader, 1, set('bc')), # Remove a from quorum + ('sync', leader, 1, set('bc')), # Can now reduce replication factor back + ]) + # alternative reality: next loop, no one reconnected + self.check_state_transitions(leader=leader, quorum=1, voters=set('abc'), + numsync=2, sync=set('abc'), numsync_confirmed=0, active=set(), + sync_wanted=1, leader_wanted=leader, expected=[ + ('quorum', leader, 0, set()), + ('sync', leader, 0, set()), + ]) + + def test_invalid_states(self): + leader = 'a' + + # Main invariant is not satisfied, system is in an unsafe state + resolver = QuorumStateResolver(leader=leader, quorum=0, voters=set('bc'), + numsync=1, sync=set('bc'), numsync_confirmed=1, + active=set('bc'), sync_wanted=1, leader_wanted=leader) + self.assertRaises(QuorumError, resolver.check_invariants) + self.assertEqual(list(resolver), [ + ('quorum', leader, 1, set('bc')) + ]) + + # Quorum and sync states mismatched, somebody other than Patroni modified the system state + resolver = QuorumStateResolver(leader=leader, quorum=1, voters=set('bc'), + numsync=2, sync=set('bd'), numsync_confirmed=1, + active=set('bd'), sync_wanted=1, leader_wanted=leader) + self.assertRaises(QuorumError, resolver.check_invariants) + self.assertEqual(list(resolver), [ + ('quorum', leader, 1, set('bd')), + ('sync', leader, 1, set('bd')), + ]) + self.assertTrue(repr(resolver.sync).startswith('