-- togethere.cloud/mds/admin_chat_encoding_repair.sql

-- 75 lines
-- 2.9 KiB
-- SQL

-- Repair script for mojibake in admin_chat_messages
-- Case handled here: UTF-8 Polish text was previously decoded as cp1250 before being stored.
-- Example: "też" became "teĹĽ", "chmurkę" became "chmurkÄ™".
--
-- Usage:
-- 1. Run the preview SELECT (Step 1) first and verify that repaired_preview_cp1250 looks correct.
-- 2. Create the backup table (Step 2) and run the backup INSERT (Step 3).
-- 3. Run the UPDATE (Step 4), then the verification SELECT (Step 5).
-- 4. If your preview looks better in repaired_preview_latin1 than in repaired_preview_cp1250,
--    use the latin1 variant from the commented section at the bottom instead of the Step 4 UPDATE.
USE `togethere_cloud`;

-- Suspicious-row filter shared by Steps 1, 3 and 4. The three copies below
-- MUST stay identical, otherwise rows could be updated without being backed up.
--   Ä â   : UTF-8 lead bytes 0xC4 / 0xE2 as rendered by both cp1250 and latin1
--   Ĺ Ă   : UTF-8 lead bytes 0xC5 / 0xC3 as rendered by cp1250
--   Å Ã   : UTF-8 lead bytes 0xC5 / 0xC3 as rendered by latin1
--   Ć Ł   : kept from the original filter
-- 'Ă' is needed so that messages whose only damaged letters are ó/Ó
-- (UTF-8 C3 B3 / C3 93, shown as "Ăł" / "Ă“" after the cp1250 mis-decode)
-- are previewed, backed up and repaired as well.
-- NOTE(review): whether REGEXP matches case-insensitively depends on the
-- server version and the column collation, so the filter may also hit
-- correctly stored text; Step 1 exists to catch that before anything changes.

-- Step 1: preview suspicious rows before any update.
-- The repair expression works in three stages:
--   CONVERT(message USING cp1250)  -> re-encode the stored text into cp1250
--   CAST(... AS BINARY)            -> drop the charset label, keep the raw bytes
--   CONVERT(... USING utf8mb4)     -> read those bytes as UTF-8
-- NOTE(review): characters that do not exist in the intermediate charset are
-- presumably replaced by '?' during the first stage - look for unexpected '?'
-- in the preview columns before running Step 4.
SELECT
    id,
    username,
    message AS current_message,
    CONVERT(CAST(CONVERT(message USING cp1250) AS BINARY) USING utf8mb4) AS repaired_preview_cp1250,
    CONVERT(CAST(CONVERT(message USING latin1) AS BINARY) USING utf8mb4) AS repaired_preview_latin1,
    created_at
FROM admin_chat_messages
WHERE message REGEXP 'Ä|Å|Ã|Ă|â|Ĺ|Ć|Ł'
ORDER BY id DESC;

-- Step 2: create a backup table if it does not exist yet.
-- LIKE copies the column and index definitions of the source table, which is
-- what allows Step 3 to copy whole rows without listing columns.
CREATE TABLE IF NOT EXISTS admin_chat_messages_encoding_backup LIKE admin_chat_messages;

-- Step 3: backup only suspicious rows before repair.
-- Rows already present in the backup are skipped, so re-running the script
-- never overwrites the first (pre-repair) copy of a message.
-- NOT EXISTS is used instead of NOT IN so a NULL id in the backup table cannot
-- silently turn the whole predicate into "no rows".
INSERT INTO admin_chat_messages_encoding_backup
SELECT m.*
FROM admin_chat_messages AS m
WHERE m.message REGEXP 'Ä|Å|Ã|Ă|â|Ĺ|Ć|Ł'
  AND NOT EXISTS (
      SELECT 1
      FROM admin_chat_messages_encoding_backup AS b
      WHERE b.id = m.id
  );

-- Step 4: repair messages using cp1250 reinterpretation.
-- The second predicate skips rows the conversion would leave unchanged.
-- NOTE(review): on MySQL a reinterpretation that is not valid UTF-8 is expected
-- to evaluate to NULL, which makes the <> comparison NULL and leaves the row
-- untouched - confirm this on the target server version.
UPDATE admin_chat_messages
SET message = CONVERT(CAST(CONVERT(message USING cp1250) AS BINARY) USING utf8mb4)
WHERE message REGEXP 'Ä|Å|Ã|Ă|â|Ĺ|Ć|Ł'
  AND message <> CONVERT(CAST(CONVERT(message USING cp1250) AS BINARY) USING utf8mb4);
-- Step 5: verify result after repair.
-- Shows the current content of every message that has a copy in the backup
-- table, highest id first, so it can be compared against the Step 1 preview.
SELECT
    m.id,
    m.username,
    m.message,
    m.created_at
FROM admin_chat_messages AS m
WHERE EXISTS (
    SELECT 1
    FROM admin_chat_messages_encoding_backup AS b
    WHERE b.id = m.id
)
ORDER BY m.id DESC;
-- Optional rollback if needed.
-- UPDATE admin_chat_messages m
-- JOIN admin_chat_messages_encoding_backup b ON b.id = m.id
-- SET m.user_id = b.user_id,
-- m.username = b.username,
-- m.message = b.message,
-- m.created_at = b.created_at,
-- m.reply_to_id = b.reply_to_id,
-- m.file_name = b.file_name,
-- m.file_mime = b.file_mime,
-- m.file_size = b.file_size,
-- m.file_data = b.file_data,
-- m.updated_at = b.updated_at,
-- m.is_hearted = b.is_hearted,
-- m.hearted_by_user_id = b.hearted_by_user_id,
-- m.hearted_by_username = b.hearted_by_username,
-- m.hearted_at = b.hearted_at;
-- Optional alternative for cases where preview shows latin1/cp1252-style mojibake instead.
-- UPDATE admin_chat_messages
-- SET message = CONVERT(CAST(CONVERT(message USING latin1) AS BINARY) USING utf8mb4)
-- WHERE message REGEXP 'Ä|Å|Ã|â|Ĺ|Ć|Ł'
-- AND message <> CONVERT(CAST(CONVERT(message USING latin1) AS BINARY) USING utf8mb4);