From 8bfe16755322955519d8a6b82462e763b179ef65 Mon Sep 17 00:00:00 2001
From: Bruno Tatu <btatu@decem.evolix.net>
Date: Thu, 13 Apr 2023 17:50:27 +0200
Subject: [PATCH] =?UTF-8?q?Cr=C3=A9ation=20de=20la=20page=20des=20erreurs?=
 =?UTF-8?q?=20de=20Mysql?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 HowtoMySQL/Erreurs.md | 429 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 429 insertions(+)
 create mode 100644 HowtoMySQL/Erreurs.md

diff --git a/HowtoMySQL/Erreurs.md b/HowtoMySQL/Erreurs.md
new file mode 100644
index 00000000..c484766d
--- /dev/null
+++ b/HowtoMySQL/Erreurs.md
@@ -0,0 +1,429 @@
+---
+categories: databases
+title: Howto Erreur MySQL
+...
+
+
+## Résolution des erreurs lors de la réplication
+
+On vérifie les erreurs avec les commandes `SHOW SLAVE STATUS` et `SHOW MASTER STATUS`.
+
+En cas d'erreur, il faut « simplement » résoudre l'erreur, puis relancer la réplication avec la commande `START SLAVE`. Voici quelques erreurs possibles :
+
+### Zapper l'erreur en cours
+
+On peut faire manuellement :
+
+~~~
+mysql> SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1; START SLAVE;
+~~~
+
+### Fichier de clé incorrect
+
+~~~
+Incorrect key file for table './base/table.MYI'; try to repair it
+~~~
+
+Il faut réparer la table concernée.
+
+### Doublon sur clé unique
+
+~~~
+Duplicate entry 'NNNNNN' for key N
+~~~
+
+Une solution *peut* être de supprimer la ligne concernée (ou de zapper l'erreur).
+
+### Beaucoup d'erreurs à ignorer
+
+Si, pour une raison ou une autre, on a plein de `DUPLICATE ENTRY` mais que l'on est **sûr** de vouloir les ignorer, on peut faire cela en redémarrant MySQL avec le paramètre : `slave-skip-errors = 1062` ; on peut faire également cela avec d'autres types d'erreurs. Malheureusement, il faut forcément redémarrer MySQL car ce paramètre ne se modifie pas à chaud : <http://bugs.mysql.com/bug.php?id=35611>
+
+On peut également avoir d'autres erreurs, par exemple _Could not execute Delete_rows event on table foo.bar; Can't find record in 'bar', Error_code: 1032; handler error HA_ERR_KEY_NOT_FOUND; the event's master log [...]_ et on mettra cette fois `slave-skip-errors = 1032`
+
+Si plusieurs types d'erreur à ignorer : `slave-skip-errors = 1032,1062`
+
+L'inconvénient est qu'il faut redémarrer MySQL. Pour éviter cela on peut automatiser le zap de l'erreur.
+
+Exemple avec l'erreur _1062_ :
+
+~~~
+# while true; do while mysql -e "show slave status" | grep '1062.Error.*REPORT'; \
+ do mysql -e "SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1; START SLAVE;"; done ; sleep 1; done
+~~~
+
+Exemple avec l'erreur _1032_ (« Could not execute Delete_rows event ») :
+
+~~~
+# while true; do while mysql -e "show slave status" | grep 'Error_code: 1032'; \
+ do mysql -e "SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1; START SLAVE;"; done ; sleep 1; done
+~~~
+
+On peut également utiliser un script Shell plus évolué qui prend les motifs à ignorer dans un fichier *errors.txt* (expressions rationnelles étendues, compatibles _grep -E_) et qui proposera de zapper manuellement (ou pas) si l'erreur ne correspond pas aux motifs :
+
+~~~{.sh}
+#!/bin/sh
+# Watches SHOW SLAVE STATUS and skips replication errors matching known patterns.
+# File of patterns (one per line, passed to grep -f) for the errors to skip.
+error_messages="errors.txt"
+
+# Argument given to sleep between two checks when no SQL error is pending.
+sleep_interval="1"
+
+# Set to "true" to exit once Seconds_Behind_Master reaches 0.
+exit_when_uptodate="false"
+
+# Extra options passed to every mysql call (uncomment to use).
+#mysql_opt="-P 3307"
+
+# File to which skipped error messages are appended (leave empty for no logs).
+log_file=""
+
+mysql_skip_error() {
+    # $1: error text. Skips one replication event, restarts the slave, logs if asked.
+    error="$1"
+    # The message is data, not a printf format: it may contain '%' or backslashes.
+    printf 'Skipping: %s\n' "$error"
+    mysql $mysql_opt -e 'SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1; START SLAVE;'
+    [ -n "$log_file" ] && printf '%s\n' "$error" >>"$log_file"
+}
+
+while true; do  # Poll the slave forever; the loop is left only through exit.
+    slave_status="$(mysql $mysql_opt -e 'SHOW SLAVE STATUS\G')"
+    seconds_behind_master=$(printf '%s\n' "$slave_status" |grep 'Seconds_Behind_Master: ' |awk -F ' ' '{print $2}')
+    last_SQL_error="$(printf '%s\n' "$slave_status" |grep 'Last_SQL_Error: ' |sed 's/^.\+Last_SQL_Error: //')"
+    # printf '%s' replaces echo / printf "$var": error texts may contain '%' or '\'.
+    if $exit_when_uptodate && [ "$seconds_behind_master" = "0" ]; then  # runs true/false
+        printf 'Replication is up to date!\n'
+        exit 0
+    # No pending SQL error: wait before polling again.
+    elif [ -z "$last_SQL_error" ]; then
+        sleep "$sleep_interval"
+    # Known error: patterns are extended regexps (grep -E), one per line in the file.
+    elif printf '%s\n' "$last_SQL_error" |grep -q -E -f "$error_messages"; then
+        mysql_skip_error "$last_SQL_error"
+    # Unknown error: show it and let the operator decide.
+    else
+        printf "Current SQL error doesn't match the pattern:\n"
+        printf '%s\n' "$last_SQL_error"
+        printf "Skip it? [y/N]: "
+        read -r reply || exit 1  # stdin closed: stop instead of looping on the prompt
+        if [ "$reply" = "y" ] || [ "$reply" = "Y" ]; then
+            mysql_skip_error "$last_SQL_error"  # quoted: pass the whole message as $1
+        fi
+    fi
+done
+~~~
+
+### Récupération de position impossible
+
+~~~
+[ERROR] Error reading packet from server: Client requested master to start replication from impossible position (server_errno=1236)
+~~~
+
+Cela signifie que la position indiquée sur le binlog du master est impossible à récupérer. On peut le vérifier avec une commande du type `mysqlbinlog mysqld-bin.00123 --start-position=251` sur le master.
+
+Si l'on constate que le binlog est corrompu avec des erreurs du type _ERROR: Error in Log_event::read_log_event(): 'read error' # Warning: this binlog is either in use or was not closed properly._ ou _ERROR: Error in Log_event::read_log_event(): 'Event too small', data_len: 0, event_type: 0_ l'idée est d'identifier les requêtes non jouées sur le slave dans le binlog corrompu (voir le *Relay_Master_Log_File* via `SHOW SLAVE STATUS`) et de les rejouer (cf [HowtoMySQL#Replay]()) puis de passer au binlog suivant via une commande du type `CHANGE MASTER TO MASTER_LOG_FILE='mysql-bin.000124' , MASTER_LOG_POS=106; START SLAVE;` (la position à indiquer est souvent `106`, cf `mysqlbinlog mysql-bin.000124`).
+
+Si l'on juge cela non nécessaire (données non critiques), on pourra bien sûr passer directement au binlog suivant en ignorant les requêtes du binlog corrompu. Bien sûr, suite à ces manipulations risquées, on vérifiera ensuite la cohérence de la base de données répliquée (`COUNT(*)` ou outils plus avancés).
+
+### Could not parse relay log event entry
+
+~~~
+Could not parse relay log event entry. The possible reasons are: the master's binary log is corrupted (you can check this by running 'mysqlbinlog' on the binary log),
+the slave's relay log is corrupted (you can check this by running 'mysqlbinlog' on the relay log), a network problem, or a bug in the master's or slave's MySQL code.
+If you want to check the master's binary log or slave's relay log, you will be able to know their names by issuing 'SHOW SLAVE STATUS' on this slave.
+~~~
+
+Il s'agit souvent d'un binlog corrompu : voir le *Relay_Master_Log_File* via `SHOW SLAVE STATUS`.
+
+**Note** : jusqu'à MySQL 5.1 au moins, changer la position dans un `Relay_log` avec un `CHANGE MASTER TO` ne marche pas. Voir [Changement de la position dans un Relay_log](#ChangementdelapositiondansunRelay_log).
+
+**Error 1594 - Relay log read failure - Could not parse relay log event entry**
+
+On peut changer la position du Relay_log. Sur le serveur slave, il faut bien identifier sur quel `Relay_Master_Log_File` et à quelle position `Exec_Master_Log_Pos` se trouve le slave.
+
+Il faut, avant toute chose, faire un `SHOW SLAVE STATUS\G` (copier la sortie complète dans un fichier à part, pour mémoire) pour voir les valeurs de `Relay_Master_Log_File` et `Exec_Master_Log_Pos` :
+
+~~~
+MariaDB [(none)]> SHOW SLAVE STATUS\G
+
+************************* 1. row ***************************
+
+               Slave_IO_State: Waiting for master to send event
+                  Master_Host: 192.168.194.74
+                  Master_User: replicator
+                  Master_Port: 3306
+                Connect_Retry: 60
+              Master_Log_File: mysql-bin.001274
+          Read_Master_Log_Pos: 1045327404
+               Relay_Log_File: 3_dbbackup.003821
+                Relay_Log_Pos: 617884398
+        Relay_Master_Log_File: mysql-bin.001273
+             Slave_IO_Running: Yes
+            Slave_SQL_Running: No
+              Replicate_Do_DB: 
+          Replicate_Ignore_DB: 
+           Replicate_Do_Table: 
+       Replicate_Ignore_Table: 
+      Replicate_Wild_Do_Table: 
+  Replicate_Wild_Ignore_Table: 
+                   Last_Errno: 1594
+                   Last_Error: Relay log read failure: Could not parse relay log event entry. The possible reasons are: the master's binary log is corrupted (you can check this by running 'mysqlbinlog' on the binary log), the slave's relay log is corrupted (you can check this by running 'mysqlbinlog' on the relay log), a network problem, or a bug in the master's or slave's MySQL code. If you want to check the master's binary log or slave's relay log, you will be able to know their names by issuing 'SHOW SLAVE STATUS' on this slave.
+                 Skip_Counter: 0
+          Exec_Master_Log_Pos: 617884110
+              Relay_Log_Space: 3192816253
+              Until_Condition: None
+               Until_Log_File: 
+                Until_Log_Pos: 0
+           Master_SSL_Allowed: No
+           Master_SSL_CA_File: 
+           Master_SSL_CA_Path: 
+              Master_SSL_Cert: 
+            Master_SSL_Cipher: 
+               Master_SSL_Key: 
+        Seconds_Behind_Master: NULL
+Master_SSL_Verify_Server_Cert: No
+                Last_IO_Errno: 0
+                Last_IO_Error: 
+               Last_SQL_Errno: 1594
+               Last_SQL_Error: Relay log read failure: Could not parse relay log event entry. The possible reasons are: the master's binary log is corrupted (you can check this by running 'mysqlbinlog' on the binary log), the slave's relay log is corrupted (you can check this by running 'mysqlbinlog' on the relay log), a network problem, or a bug in the master's or slave's MySQL code. If you want to check the master's binary log or slave's relay log, you will be able to know their names by issuing 'SHOW SLAVE STATUS' on this slave.
+  Replicate_Ignore_Server_Ids: 
+             Master_Server_Id: 13
+               Master_SSL_Crl: 
+           Master_SSL_Crlpath: 
+                   Using_Gtid: No
+                  Gtid_IO_Pos: 
+      Replicate_Do_Domain_Ids: 
+  Replicate_Ignore_Domain_Ids: 
+                Parallel_Mode: conservative
+~~~
+
+Il faut donc stopper le slave :
+
+~~~
+MariaDB [(none)]> STOP SLAVE;
+~~~
+
+Ensuite faire un `CHANGE MASTER TO` en indiquant le `Relay_Master_Log_File` et la position du `Exec_Master_Log_Pos` comme ceci dans l'exemple ci-dessus :
+
+~~~
+MariaDB [(none)]> CHANGE MASTER TO master_log_file='mysql-bin.001273', master_log_pos=617884110;
+
+MariaDB [(none)]> START SLAVE;
+~~~
+
+Normalement, à ce stade, la réplication continue à la position indiquée. Il se peut qu'il y ait des `Duplicate Entry`, qu'il faut alors étudier de près pour envisager de les sauter.
+
+
+### Erreur fatale à la lecture du binlog
+
+Erreur : `Got fatal error 1236 from master when reading data from binary log: 'log event entry exceeded max_allowed_packet; Increase max_allowed_packet on master'`
+
+On obtient apparemment cela dans différents cas.
+
+* L'un d'eux serait si max_allowed_packet est inférieur à read_buffer_size ; voir <http://www.mysqlperformanceblog.com/2012/06/06/read_buffer_size-can-break-your-replication/> ;
+* dans d'autres cas, il faudra forcer la réplication à se poursuivre via `STOP SLAVE; CHANGE MASTER TO MASTER_LOG_FILE='mysql-bin.00XXXX' , MASTER_LOG_POS=XXXX; START SLAVE;`
+* dans un autre cas, la position indiquée n'existe pas dans le binlog
+* enfin voir <http://dev.mysql.com/doc/refman/5.1/en/replication-features-max-allowed-packet.html>
+
+**Réinitialiser la réplication**
+
+Dans certains cas **exceptionnels**, une solution radicale est de réinitialiser la réplication avec un `STOP SLAVE; RESET SLAVE; START SLAVE;` Attention, cela doit être fait dans de très rares cas maîtrisés (attention notamment aux conflits _DUPLICATE ENTRY_ que cela risque de provoquer).
+
+**Status OK, mais pas de réplication**
+
+Si un `SHOW SLAVE STATUS` ne retourne pas d'erreur mais que la réplication ne se fait pas, les logs du slave peuvent contenir une erreur du type :
+
+~~~
+[Note] Slave I/O thread: Failed reading log event, reconnecting to retry, log 'mysql-bin.003357' at position 389449
+[Note] Slave: received end packet from server, apparent master shutdown:
+~~~
+
+Il se peut que le master se réplique sur 2 slaves ayant un server-id identique !
+
+
+### Erreur de checksum fatale
+
+Si vous rencontrez une erreur du type : 
+
+```
+Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'mysql-bin.015405' at 4, the last event read from 'mysql-bin.015405' at 4, the last byte read from 'mysql-bin.015405' at 256.'
+```
+
+Il est possible que l'erreur soit due au fait que le master et le slave n'ont pas la même version.
+
+
+### ERROR 1201 lors de l'injection du dump
+
+Si lors de l'injection du dump sur le slave cette erreur apparaît :
+
+`ERROR 1201 (HY000) at line 22: Could not initialize master info structure`
+
+Il faut supprimer toute trace des anciennes réplications avec :
+
+~~~
+RESET SLAVE;
+~~~
+
+Et ensuite réinjecter le dump.
+ 
+### Changement de la position dans un Relay_log
+
+À faire uniquement si en tentant de changer la position d'un _Relay_log_ sur un slave, vous obtenez cette erreur :
+
+~~~
+Error initializing relay log position: Could not find target log during
+relay log initialization
+~~~
+
+Il faut alors stopper le processus slave de réplication :
+
+~~~
+mysql> STOP SLAVE;
+~~~
+
+Puis éditer (en gardant une sauvegarde) le fichier `${datadir}/relay-log.info`. La première ligne correspond au `Relay_Log_File`, la seconde au `Relay_Log_Pos`.
+Redémarrer MySQL.
+
+### Réintégrer dans la réplication une base qui aurait été exclue.
+
+Dans cet exemple, la base avait été exclue de la réplication avec `replicate-ignore-db` : les requêtes concernant cette base sont ignorées sur le slave, mais le master continue de les écrire dans les binlogs.
+
+1 - On enlève l'exclusion de la base dans le fichier de configuration :
+
+~~~
+#replicate-ignore-db = foo
+~~~
+
+2 - On fait un dump de cette base sur le *master* :
+
+~~~
+# mysqldump foo > foo.sql 
+~~~
+
+3- On réinjecte ce dump sur le *slave* :
+
+~~~
+# mysql -o foo < foo.sql
+~~~
+
+4- On redémarre MySQL (ou l'instance) sur le *slave*, pour qu'il relise le fichier de configuration et ignore l'exclusion de la base sur le slave :
+
+~~~
+# systemctl restart mysql.service
+~~~
+
+S'il s'agit d'une instance, exemple si l'instance se trouve sur le port 3307 :
+
+~~~
+# mysqladmin -P 3307 shutdown
+# mysqld_multi start 1
+~~~
+
+En cas de réplication master - master, avec un slave de chaque côté, il faut le faire des deux côtés.
+
+Il existe d'autres méthodes pour faire cela ; celle-ci est la plus simple et elle a l'avantage de gérer les locks des tables : on n'a pas à le faire à la main.
+
+
+
+## Erreurs de réplication par "Channel" ou "Group réplication"
+
+Dans le cas d'une réplication par channel, ou Group Réplication, la sortie de `SHOW SLAVE STATUS\G` indique sur quels Channel il y a une erreur, mais sans forcément indiquer de manière explicite l'erreur en question.
+
+Exemple :
+
+~~~
+mysql> SHOW SLAVE STATUS\G
+
+.....
+
+ Exec_Master_Log_Pos: 18089090
+              Relay_Log_Space: 1104382466
+              Until_Condition: None
+               Until_Log_File: 
+                Until_Log_Pos: 0
+           Master_SSL_Allowed: No
+           Master_SSL_CA_File: 
+           Master_SSL_CA_Path: 
+              Master_SSL_Cert: 
+            Master_SSL_Cipher: 
+               Master_SSL_Key: 
+        Seconds_Behind_Master: NULL
+Master_SSL_Verify_Server_Cert: No
+                Last_IO_Errno: 0
+                Last_IO_Error: 
+               Last_SQL_Errno: 1064
+               Last_SQL_Error: Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 1 failed executing transaction 'ANONYMOUS' at master log mysql-bin.000807, end_log_pos 18089265. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
+  Replicate_Ignore_Server_Ids: 
+             Master_Server_Id: 62
+                  Master_UUID: 
+             Master_Info_File: mysql.slave_master_info
+                    SQL_Delay: 0
+          SQL_Remaining_Delay: NULL
+      Slave_SQL_Running_State: 
+           Master_Retry_Count: 86400
+                  Master_Bind: 
+      Last_IO_Error_Timestamp: 
+     Last_SQL_Error_Timestamp: 200703 10:56:11
+               Master_SSL_Crl: 
+           Master_SSL_Crlpath: 
+           Retrieved_Gtid_Set: 
+            Executed_Gtid_Set: 
+                Auto_Position: 0
+         Replicate_Rewrite_DB: 
+                 Channel_Name: su6-sql2
+           Master_TLS_Version: 
+       Master_public_key_path: 
+        Get_master_public_key: 0
+            Network_Namespace: 
+
+~~~
+
+On voit que la réplication sur le Channel nommé `su6-sql2` est en erreur, mais on ne sait pas sur quelle requête.
+
+Le message d'erreur suggère de regarder les logs d'erreurs dans la table `performance_schema.replication_applier_status_by_worker`.
+
+On peut le faire comme ceci, en indiquant le Channel en erreur, et le numéro du Worker, comme indiqué dans le message d'erreur précédent:
+
+Exemple :
+
+~~~
+mysql> SELECT * FROM performance_schema.replication_applier_status_by_worker WHERE WORKER_ID=1 AND CHANNEL_NAME='su6-sql2'\G;
+
+*************************** 1. row ***************************
+                                           CHANNEL_NAME: su6-sql2
+                                              WORKER_ID: 1
+                                              THREAD_ID: NULL
+                                          SERVICE_STATE: OFF
+                                      LAST_ERROR_NUMBER: 1064
+                                     LAST_ERROR_MESSAGE: Worker 1 failed executing transaction 'ANONYMOUS' at master log mysql-bin.000807, end_log_pos 18089265; Error 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '=1' at line 1' on query. Default database: 'DB_content_cfbe'. Query: 'delete from toppops_cfbe_null_20200702_104153_V220 where rank=1'
+                                   LAST_ERROR_TIMESTAMP: 2020-07-03 10:56:11.801803
+                               LAST_APPLIED_TRANSACTION: 
+     LAST_APPLIED_TRANSACTION_ORIGINAL_COMMIT_TIMESTAMP: 0000-00-00 00:00:00.000000
+    LAST_APPLIED_TRANSACTION_IMMEDIATE_COMMIT_TIMESTAMP: 0000-00-00 00:00:00.000000
+         LAST_APPLIED_TRANSACTION_START_APPLY_TIMESTAMP: 0000-00-00 00:00:00.000000
+           LAST_APPLIED_TRANSACTION_END_APPLY_TIMESTAMP: 0000-00-00 00:00:00.000000
+                                   APPLYING_TRANSACTION: 
+         APPLYING_TRANSACTION_ORIGINAL_COMMIT_TIMESTAMP: 0000-00-00 00:00:00.000000
+        APPLYING_TRANSACTION_IMMEDIATE_COMMIT_TIMESTAMP: 0000-00-00 00:00:00.000000
+             APPLYING_TRANSACTION_START_APPLY_TIMESTAMP: 0000-00-00 00:00:00.000000
+                 LAST_APPLIED_TRANSACTION_RETRIES_COUNT: 0
+   LAST_APPLIED_TRANSACTION_LAST_TRANSIENT_ERROR_NUMBER: 0
+  LAST_APPLIED_TRANSACTION_LAST_TRANSIENT_ERROR_MESSAGE: 
+LAST_APPLIED_TRANSACTION_LAST_TRANSIENT_ERROR_TIMESTAMP: 0000-00-00 00:00:00.000000
+                     APPLYING_TRANSACTION_RETRIES_COUNT: 0
+       APPLYING_TRANSACTION_LAST_TRANSIENT_ERROR_NUMBER: 0
+      APPLYING_TRANSACTION_LAST_TRANSIENT_ERROR_MESSAGE: 
+    APPLYING_TRANSACTION_LAST_TRANSIENT_ERROR_TIMESTAMP: 0000-00-00 00:00:00.000000
+1 row in set (0.01 sec)
+
+ERROR: 
+No query specified
+~~~
+
+Comme ça on peut voir l'erreur explicite :
+
+~~~
+Error 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '=1' at line 1' on query. Default database: 'DB_content_cfbe'. Query: 'delete from toppops_cfbe_null_20200702_104153_V220 where rank=1'
+~~~
+