Source code for autils.devel.astring

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright: Red Hat Inc. 2013-2014
# Authors: Lucas Meneghel Rodrigues <lmr@redhat.com>

"""Operations with strings (conversion and sanitation).

The unusual name aims to avoid causing name clashes with the stdlib module
string. Even with the dot notation, people may try to do things like

   import string
   ...
   from avocado.utils import string

And not notice until their code starts failing.
"""

import itertools
import locale
import re

from autils.file import path

#: System encoding, evaluated once when this module is imported, as
#: reported by :func:`locale.getpreferredencoding`.
#: Use this value wisely: files may have been written using a different
#: encoding than the one the current locale prefers.
ENCODING = locale.getpreferredencoding()

#: String containing all fs-unfriendly chars (Windows-fat/Linux-ext3).
#: Every character listed here is replaced by ``_`` in
#: :func:`string_to_safe_path`.
FS_UNSAFE_CHARS = '<>:"/\\|?*;'

# Translation table mapping each fs-unfriendly char to an underscore.
# ``bytes.maketrans`` requires both arguments to have the same length, so the
# replacement literal carries one ``_`` per character of FS_UNSAFE_CHARS.
# The table is a 256-entry bytes object; indexing it yields integers, which
# is why it can also be handed to ``str.translate`` (code points outside the
# table raise a LookupError there and are left unchanged by ``translate``).
_FS_TRANSLATE = bytes.maketrans(bytes(FS_UNSAFE_CHARS, "ascii"), b"__________")



[docs]
def bitlist_to_string(data):
    """Pack a list of bits into bytes and decode them as ASCII text.

    Bits are consumed most-significant-bit first, eight at a time. A trailing
    group of fewer than eight bits is accumulated but never emitted.

    :param data: Iterable of integers (expected to be 0 or 1), MSB first
    :type data: list[int]
    :returns: The text obtained by decoding the packed bytes as ASCII
    :rtype: str
    :raises UnicodeDecodeError: If a packed byte is not valid ASCII (> 127)

    .. rubric:: Example

    >>> bitlist_to_string([0, 1, 0, 0, 0, 0, 0, 1])  # 'A' = 65
    'A'
    >>> bitlist_to_string([1, 0, 0, 0])  # Incomplete byte
    ''
    """
    packed = bytearray()
    current = 0
    for index, bit in enumerate(data):
        offset = index % 8
        # Position 0 within a group is the most significant bit (shift by 7).
        current |= bit << (7 - offset)
        if offset == 7:
            # Eighth bit of the group reached: emit the byte and start over.
            packed.append(current)
            current = 0
    return packed.decode("ascii")




[docs]
def string_to_bitlist(data):
    """Expand every character of a string into its eight low-order bits.

    For each character the code point is taken with :func:`ord` and bits
    7 down to 0 are emitted, so the most significant bit comes first.
    Only the lowest eight bits of each code point are represented.

    :param data: String to be expanded into bits
    :type data: str
    :returns: Flat list of 0/1 integers, eight per input character
    :rtype: list[int]

    .. rubric:: Example

    >>> string_to_bitlist('A')  # 'A' = 65 = 01000001
    [0, 1, 0, 0, 0, 0, 0, 1]
    >>> string_to_bitlist('AB')
    [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]
    """
    return [
        (ord(char) >> shift) & 1
        for char in data
        for shift in range(7, -1, -1)
    ]




[docs]
def shell_escape(command):
    """Escape a command so it can be embedded in a double quoted (" ")
    string of a (ba)sh command line.

    A backslash is placed in front of every backslash, dollar sign, double
    quote and backtick found in ``command``. All other characters are left
    untouched.

    :param command: The command string to escape
    :type command: str
    :returns: The command with the four special characters backslash-escaped
    :rtype: str
    """
    # One pass over the input: every special character maps to its escaped
    # two-character form, so freshly inserted backslashes are never re-escaped.
    escape_table = str.maketrans(
        {
            "\\": "\\\\",
            "$": r"\$",
            '"': r"\"",
            "`": r"\`",
        }
    )
    return command.translate(escape_table)




[docs]
def strip_console_codes(output, custom_codes=None):
    """Remove Linux console escape and control sequences from console output.

    Removes ANSI escape sequences and other console control codes to make
    the output readable and suitable for result checking. Handles common
    codes used during system boot and terminal color formatting.

    The text is processed one ESC-delimited segment at a time: each segment
    is matched against the console code regex and, when a code is found,
    as many leading characters as the matched code is long are dropped and
    the remainder is kept.

    :param output: The console output string containing escape sequences
    :type output: str
    :param custom_codes: Additional regex patterns for codes not covered
                        by the default patterns. Appended as a further
                        alternative to the built-in console codes regex,
                        unless the given text already occurs literally
                        inside that regex.
    :type custom_codes: str or None
    :returns: The text that remains after the recognised codes are removed
    :rtype: str
    :raises ValueError: If a segment matches none of the known patterns and
                       the position check in the loop indicates that enough
                       output follows it; otherwise such a segment is
                       dropped silently.

    .. note::
       If the output doesn't contain ``\\x1b`` (ESC character), the original
       string is returned unchanged for performance.

    .. rubric:: Supported Console Codes

    * ANSI color codes: ``\\x1b[31m``, ``\\x1b[0m``, etc.
    * Cursor positioning: ``\\x1b[H``, ``\\x1b[2J``, etc.
    * Character set selection: ``\\x1b(B``, ``\\x1b(0``, etc.
    * Custom codes via the ``custom_codes`` parameter

    .. rubric:: Example

    >>> strip_console_codes('\\x1b[31mRed Text\\x1b[0m')
    'Red Text'
    >>> strip_console_codes('Normal text')
    'Normal text'
    """
    # Fast path: nothing to strip when there is no ESC character at all.
    if "\x1b" not in output:
        return output

    old_word = ""  # last segment whose text was appended to the result
    return_str = ""
    index = 0
    # Prepend a known code ("ESC [ m") so the text always starts with an
    # escape sequence and the leading plain text is handled like any segment.
    output = f"\x1b[m{output}"
    # Alternatives of the regex describing the codes that may follow an ESC.
    console_codes = "%[G@8]|\\[[@A-HJ-MPXa-hl-nqrsu\\`]"
    console_codes += "|\\[[\\d;]+[HJKgqnrm]|#8|\\([B0UK]|\\)"
    # Plain substring test: custom_codes is skipped only if its text already
    # appears verbatim inside the built-in pattern.
    if custom_codes is not None and custom_codes not in console_codes:
        console_codes += f"|{custom_codes}"
    while index < len(output):
        tmp_index = 0
        tmp_word = ""
        # Collect characters starting at ``index`` until the collected text
        # holds two ESC characters or the end of the output is reached.
        while len(re.findall("\x1b", tmp_word)) < 2 and index + tmp_index < len(output):
            tmp_word += output[index + tmp_index]
            tmp_index += 1

        # Keep only the segment content (code + following text), without ESCs.
        tmp_word = re.sub("\x1b", "", tmp_word)
        # Advance by the segment length plus one ESC character.
        index += len(tmp_word) + 1
        # NOTE(review): a segment identical to the previously kept one is
        # skipped, so genuinely repeated "code + text" segments are emitted
        # only once - confirm this is intended.
        if tmp_word == old_word:
            continue
        try:
            # NOTE(review): findall is not anchored, so the first match found
            # anywhere in the segment is used, not necessarily one at its start.
            special_code = re.findall(console_codes, tmp_word)[0]
        except IndexError as exc:
            # No known code in this segment: raise only when this position
            # check says more output remains; otherwise drop the segment.
            if index + tmp_index < len(output):
                raise ValueError(
                    f"{tmp_word} is not included in the known "
                    f"console codes list {console_codes}"
                ) from exc
            continue
        # The segment consists of the code only; there is no text to keep.
        if special_code == tmp_word:
            continue
        old_word = tmp_word
        # Drop as many leading characters as the matched code is long.
        return_str += tmp_word[len(special_code) :]
    return return_str




[docs]
def iter_tabular_output(matrix, header=None, strip=False):
    """Yield the rows of a matrix as column-aligned strings.

    Every element is first converted with :func:`string_safe_encode`. The
    width of a column is the largest visible length of its elements, where
    the visible length is measured after :func:`strip_console_codes`, so
    console escape sequences do not distort the alignment. Columns are
    separated by a single space and the last element of each row is never
    padded. Rows without any element are skipped.

    :param matrix: Matrix representation (iterable of rows, each row being an
                   iterable of elements). Rows may differ in length.
    :type matrix: list
    :param header: Optional row of header elements emitted before the matrix
                   rows and aligned together with them.
    :type header: tuple or list or None
    :param strip: When true, trailing whitespace is removed from each
                  yielded row.
    :type strip: bool
    :returns: Generator yielding one formatted string per non-empty row
    :rtype: generator of str
    """
    rows = itertools.chain([header], matrix) if header else matrix

    col_widths = []  # widest visible length seen so far for each column
    rendered_rows = []  # rows converted to strings
    pad_basis = []  # visible lengths of every cell but the last, per row
    for raw_row in rows:
        cells = [string_safe_encode(item) for item in raw_row]
        visible = [len(strip_console_codes(cell)) for cell in cells]
        for position, width in enumerate(visible):
            if position < len(col_widths):
                if width > col_widths[position]:
                    col_widths[position] = width
            else:
                col_widths.append(width)
        rendered_rows.append(cells)
        # The last cell still contributes to the column widths above, but it
        # is excluded here because it is emitted without padding.
        pad_basis.append(visible[:-1])

    for cells, visible in zip(rendered_rows, pad_basis):
        if not cells:
            continue  # Skip empty rows
        pieces = [
            cell + " " * (col_widths[position] - width)
            for position, (cell, width) in enumerate(zip(cells, visible))
        ]
        pieces.append(cells[-1])
        line = " ".join(pieces)
        yield line.rstrip() if strip else line




[docs]
def tabular_output(matrix, header=None, strip=False):
    """Return a matrix as one column-aligned, newline-separated string.

    Convenience wrapper that collects every row produced by
    :func:`iter_tabular_output` and joins them with ``"\\n"``. No trailing
    newline is added.

    :param matrix: Matrix representation as list of rows, where each row
                  is a list of column elements. Rows may have different lengths.
    :type matrix: list[list]
    :param header: Optional header row placed before the matrix rows and
                  aligned with the same column widths.
    :type header: list or tuple or None
    :param strip: If True, removes trailing whitespace from each output row
    :type strip: bool
    :returns: The formatted table as a single string
    :rtype: str

    .. rubric:: Example

    >>> matrix = [['Alice', '25', 'Engineer'], ['Bob', '30', 'Designer']]
    >>> print(tabular_output(matrix, header=['Name', 'Age', 'Role']))
    Name  Age Role
    Alice 25  Engineer
    Bob   30  Designer

    .. seealso::
       :func:`iter_tabular_output` for the underlying generator implementation
    """
    formatted_rows = iter_tabular_output(matrix, header, strip)
    return "\n".join(formatted_rows)




[docs]
def string_safe_encode(input_str):
    """Return the input as a string, converting it with ``str()`` if needed.

    Values that already are ``str`` instances are handed back untouched;
    anything else (numbers, containers, ``None``, arbitrary objects) is
    passed through ``str()``.

    :param input_str: Value to be represented as a string
    :type input_str: Any
    :returns: The input itself when it is a ``str``, else ``str(input_str)``
    :rtype: str

    .. note::
       No encoding or decoding takes place here. In particular a ``bytes``
       value is converted with ``str()`` (yielding its ``b'...'``
       representation), not decoded.

    .. rubric:: Example

    >>> string_safe_encode('hello')
    'hello'
    >>> string_safe_encode(42)
    '42'
    >>> string_safe_encode([1, 2, 3])
    '[1, 2, 3]'
    """
    return input_str if isinstance(input_str, str) else str(input_str)




[docs]
def string_to_safe_path(input_str):
    """Convert string to a filesystem-safe filename or directory name.

    Sanitizes strings for use as filenames by replacing characters that are
    not allowed on common filesystems (FAT, NTFS, ext3/4) with underscores.
    Also handles length limits and hidden file conventions.

    :param input_str: String to be converted to a safe filename
    :type input_str: str
    :returns: Filesystem-safe string suitable for use as filename or directory name
    :rtype: str

    .. rubric:: Transformations Applied

    * Replaces unsafe characters with underscores: ``< > : " / \\ | ? * ;``
    * Limits length to filesystem maximum (typically 255 characters)
    * Converts hidden files (starting with ``.``) to start with ``_``
    * Handles Unicode characters that may cause encoding issues

    .. rubric:: Cross-Platform Compatibility

    The function ensures compatibility with:

    * **Windows**: FAT32, NTFS filesystems
    * **Linux**: ext3, ext4 filesystems
    * **macOS**: HFS+, APFS filesystems

    .. rubric:: Example

    >>> string_to_safe_path('my file: <test>.txt')
    'my file_ _test_.txt'
    >>> string_to_safe_path('.hidden_file')
    '_hidden_file'
    >>> string_to_safe_path('very_long_filename' * 20)  # Too long
    'very_long_filename...'  # Truncated to max length

    .. seealso::
       :data:`FS_UNSAFE_CHARS` for the complete list of replaced characters
    """
    max_length = path.get_max_file_name_length(input_str)

    if input_str.startswith("."):
        input_str = "_" + input_str[1:max_length]
    elif len(input_str) > max_length:
        input_str = input_str[:max_length]

    try:
        return input_str.translate(_FS_TRANSLATE)
    except TypeError:
        # Deal with incorrect encoding
        for bad_chr in FS_UNSAFE_CHARS:
            input_str = input_str.replace(bad_chr, "_")
        return input_str




[docs]
def is_bytes(data):
    """Tell whether ``data`` is an instance of ``bytes``.

    :param data: The object to inspect
    :type data: Any
    :returns: True when ``data`` is a ``bytes`` instance (subclasses
              included), False for everything else
    :rtype: bool

    .. note::
       ``bytearray``, ``memoryview`` and ``str`` objects are not ``bytes``
       instances and therefore yield False.

    .. rubric:: Example

    >>> is_bytes(b'hello')
    True
    >>> is_bytes('hello')
    False
    >>> is_bytes(bytearray(b'hello'))
    False
    """
    result = isinstance(data, bytes)
    return result




[docs]
def is_text(data):
    """Tell whether ``data`` is an instance of ``str``.

    :param data: The object to inspect
    :type data: Any
    :returns: True when ``data`` is a ``str`` instance (subclasses
              included), False for everything else
    :rtype: bool

    .. note::
       ``bytes`` and ``bytearray`` objects are not text and yield False.

    .. rubric:: Example

    >>> is_text('hello')
    True
    >>> is_text(b'hello')
    False
    >>> is_text(42)
    False
    """
    result = isinstance(data, str)
    return result




[docs]
def to_text(data, encoding=ENCODING, errors="strict"):
    """Return a text (``str``) version of any input.

    :param data: Data to be converted to text string
    :type data: bytes or str or Any
    :param encoding: Codec used to decode ``bytes`` input. ``None`` selects
                    the module level :data:`ENCODING`. Ignored for
                    non-bytes input.
    :type encoding: str or None
    :param errors: Error handler passed to ``bytes.decode``. Ignored for
                  non-bytes input.
    :type errors: str
    :returns: Text representation of the input data
    :rtype: str
    :raises UnicodeDecodeError: When bytes cannot be decoded with the
                               specified encoding and errors='strict'

    .. rubric:: Conversion Logic

    1. **str input**: returned unchanged
    2. **bytes input**: decoded with ``encoding`` and ``errors``
    3. **Other types**: converted using ``str()``

    .. rubric:: Error Handling Options

    * ``'strict'``: Raise exception on decode errors (default)
    * ``'ignore'``: Skip invalid characters
    * ``'replace'``: Replace invalid characters with ``\\ufffd``
    * ``'backslashreplace'``: Replace with backslash escape sequences

    .. rubric:: Example

    >>> to_text(b'hello', 'utf-8')
    'hello'
    >>> to_text('already text')
    'already text'
    >>> to_text(42)
    '42'
    >>> to_text(b'\\xff', 'utf-8', errors='ignore')
    ''

    .. seealso::
       `Python Codec Error Handlers
       <https://docs.python.org/3/library/codecs.html#error-handlers>`_
    """
    if isinstance(data, str):
        return data
    if is_bytes(data):
        codec = ENCODING if encoding is None else encoding
        return data.decode(codec, errors=errors)
    return str(data)