Handle non-ASCII identifiers in Ada

Ada allows non-ASCII identifiers, and GNAT supports several such encodings. This patch adds the corresponding support to gdb. GNAT encodes non-ASCII characters using special symbol names. For character sets like Latin-1, where all characters are a single byte, it uses a "U" followed by the hex for the character. So, for example, thorn would be encoded as "Ufe" (0xFE being lower case thorn). For wider characters, despite what the manual says (it claims Shift-JIS and EUC can be used), in practice recent versions only support Unicode. Here, characters in the base plane are represented using "Wxxxx" and characters outside the base plane using "WWxxxxxxxx". GNAT has some further quirks here. Ada is case-insensitive, and GNAT emits symbols that have been case-folded. For characters in ASCII, and for all characters in non-Unicode character sets, lower case is used. For Unicode, however, characters that fit in a single byte are converted to lower case, but all others are converted to upper case. Furthermore, there is a bug in GNAT where two symbols that differ only in the case of "Y WITH DIAERESIS" (and potentially others, I did not check exhaustively) can be used in one program. I chose to omit handling this case from gdb, on the theory that it is hard to figure out the logic, and anyway if the bug is ever fixed, we'll regret having a heuristic. This patch introduces a new "ada source-charset" setting. It defaults to Latin-1, as that is GNAT's default. This setting controls how "U" characters are decoded -- W/WW are always handled as UTF-32. The ada_tag_name_from_tsd change is needed because this function will read memory from the inferior and interpret it -- and this caused an encoding failure on PPC when running a test that tries to read uninitialized memory. This patch implements its own UTF-32-based case folder. This avoids host platform quirks, and is relatively simple. A short Python program to generate the case-folding table is included. 
It simply relies on whatever version of Unicode is used by the host Python, which seems basically acceptable. Test cases for UTF-8, Latin-1, and Latin-3 are included. This exercises most of the new code paths, aside from Y WITH DIAERESIS as noted above.
2022-02-03 10:42:07 -07:00 · 2022-02-03 10:42:07 -07:00 · 315e4ebb4b
commit 315e4ebb4b
parent ee3d464915
19 changed files with 2262 additions and 21 deletions
--- a/gdb/NEWS
+++ b/gdb/NEWS
@ -111,6 +111,12 @@ show style disassembler enabled
  package is available, then, when this setting is on, disassembler
  output will have styling applied.

+set ada source-charset
+show ada source-charset
+  Set the character set encoding that is assumed for Ada symbols.  Valid
+  values for this follow the values that can be passed to the GNAT
+  compiler via the '-gnati' option.  The default is ISO-8859-1.
+
 * Changed commands

 maint packet
--- a/gdb/ada-casefold.h
+++ b/gdb/ada-casefold.h
--- a/gdb/ada-exp.y
+++ b/gdb/ada-exp.y
@ -1549,10 +1549,14 @@ write_var_or_type (struct parser_state *par_state,
 	  int terminator = encoded_name[tail_index];

 	  encoded_name[tail_index] = '\0';
-	  std::vector<struct block_symbol> syms
-	    = ada_lookup_symbol_list (encoded_name, block, VAR_DOMAIN);
+	  /* In order to avoid double-encoding, we want to only pass
+	     the decoded form to lookup functions.  */
+	  std::string decoded_name = ada_decode (encoded_name);
 	  encoded_name[tail_index] = terminator;

+	  std::vector<struct block_symbol> syms
+	    = ada_lookup_symbol_list (decoded_name.c_str (), block, VAR_DOMAIN);
+
 	  type_sym = select_possible_type_sym (syms);

 	  if (type_sym != NULL)
@ -1626,7 +1630,7 @@ write_var_or_type (struct parser_state *par_state,
 	  else if (syms.empty ())
 	    {
 	      struct bound_minimal_symbol msym
-		= ada_lookup_simple_minsym (encoded_name);
+		= ada_lookup_simple_minsym (decoded_name.c_str ());
 	      if (msym.minsym != NULL)
 		{
 		  par_state->push_new<ada_var_msym_value_operation> (msym);
--- a/gdb/ada-lang.c
+++ b/gdb/ada-lang.c
@ -59,6 +59,7 @@
 #include "gdbsupport/byte-vector.h"
 #include <algorithm>
 #include "ada-exp.h"
+#include "charset.h"

 /* Define whether or not the C operator '/' truncates towards zero for
   differently signed operands (truncation direction is undefined in C).
@ -209,6 +210,38 @@ static symbol_name_matcher_ftype *ada_get_symbol_name_matcher



+/* The character set used for source files.  */
+static const char *ada_source_charset;
+
+/* The string "UTF-8".  This is here so we can check for the UTF-8
+   charset using == rather than strcmp.  */
+static const char ada_utf8[] = "UTF-8";
+
+/* Each entry in the UTF-32 case-folding table is of this form.  */
+struct utf8_entry
+{
+  /* The start and end, inclusive, of this range of codepoints.  */
+  uint32_t start, end;
+  /* The delta to apply to get the upper-case form.  0 if this is
+     already upper-case.  */
+  int upper_delta;
+  /* The delta to apply to get the lower-case form.  0 if this is
+     already lower-case.  */
+  int lower_delta;
+
+  bool operator< (uint32_t val) const
+  {
+    return end < val;
+  }
+};
+
+static const utf8_entry ada_case_fold[] =
+{
+#include "ada-casefold.h"
+};
+
+
+
 /* The result of a symbol lookup to be stored in our symbol cache.  */

 struct cache_entry
@ -843,6 +876,52 @@ is_compiler_suffix (const char *str)
  return *str == '\0' || (str[0] == ']' && str[1] == '\0');
 }

+/* Append a non-ASCII character to RESULT.  */
+static void
+append_hex_encoded (std::string &result, uint32_t one_char)
+{
+  if (one_char <= 0xff)
+    {
+      result.append ("U");
+      result.append (phex (one_char, 1));
+    }
+  else if (one_char <= 0xffff)
+    {
+      result.append ("W");
+      result.append (phex (one_char, 2));
+    }
+  else
+    {
+      result.append ("WW");
+      result.append (phex (one_char, 4));
+    }
+}
+
+/* Return a string that is a copy of the data in STORAGE, with
+   non-ASCII characters replaced by the appropriate hex encoding.  A
+   template is used because, for UTF-8, we actually want to work with
+   UTF-32 codepoints.  */
+template<typename T>
+std::string
+copy_and_hex_encode (struct obstack *storage)
+{
+  const T *chars = (T *) obstack_base (storage);
+  int num_chars = obstack_object_size (storage) / sizeof (T);
+  std::string result;
+  for (int i = 0; i < num_chars; ++i)
+    {
+      if (chars[i] <= 0x7f)
+	{
+	  /* The host character set has to be a superset of ASCII, as
+	     are all the other character sets we can use.  */
+	  result.push_back (chars[i]);
+	}
+      else
+	append_hex_encoded (result, chars[i]);
+    }
+  return result;
+}
+
 /* The "encoded" form of DECODED, according to GNAT conventions.  If
   THROW_ERRORS, throw an error if invalid operator name is found.
   Otherwise, return the empty string in that case.  */
@ -854,8 +933,12 @@ ada_encode_1 (const char *decoded, bool throw_errors)
    return {};

  std::string encoding_buffer;
+  bool saw_non_ascii = false;
  for (const char *p = decoded; *p != '\0'; p += 1)
    {
+      if ((*p & 0x80) != 0)
+	saw_non_ascii = true;
+
      if (*p == '.')
 	encoding_buffer.append ("__");
      else if (*p == '[' && is_compiler_suffix (p))
@ -887,23 +970,70 @@ ada_encode_1 (const char *decoded, bool throw_errors)
 	encoding_buffer.push_back (*p);
    }

+  /* If a non-ASCII character is seen, we must convert it to the
+     appropriate hex form.  As this is more expensive, we keep track
+     of whether it is even necessary.  */
+  if (saw_non_ascii)
+    {
+      auto_obstack storage;
+      bool is_utf8 = ada_source_charset == ada_utf8;
+      try
+	{
+	  convert_between_encodings
+	    (host_charset (),
+	     is_utf8 ? HOST_UTF32 : ada_source_charset,
+	     (const gdb_byte *) encoding_buffer.c_str (),
+	     encoding_buffer.length (), 1,
+	     &storage, translit_none);
+	}
+      catch (const gdb_exception &)
+	{
+	  static bool warned = false;
+
+	  /* Converting to the target charset shouldn't normally fail,
+	     so if it does, we might like to know why.  */
+	  if (!warned)
+	    {
+	      warned = true;
+	      warning (_("charset conversion failure for '%s'.\n"
+			 "You may have the wrong value for 'set ada source-charset'."),
+		       encoding_buffer.c_str ());
+	    }
+
+	  /* We don't try to recover from errors.  */
+	  return encoding_buffer;
+	}
+
+      if (is_utf8)
+	return copy_and_hex_encode<uint32_t> (&storage);
+      return copy_and_hex_encode<gdb_byte> (&storage);
+    }
+
  return encoding_buffer;
 }

-/* The "encoded" form of DECODED, according to GNAT conventions.  */
-
-std::string
-ada_encode (const char *decoded)
+/* Find the entry for C in the case-folding table.  Return nullptr if
+   the entry does not cover C.  */
+static const utf8_entry *
+find_case_fold_entry (uint32_t c)
 {
-  return ada_encode_1 (decoded, true);
+  auto iter = std::lower_bound (std::begin (ada_case_fold),
+				std::end (ada_case_fold),
+				c);
+  if (iter == std::end (ada_case_fold)
+      || c < iter->start
+      || c > iter->end)
+    return nullptr;
+  return &*iter;
 }

 /* Return NAME folded to lower case, or, if surrounded by single
-   quotes, unfolded, but with the quotes stripped away.  Result good
-   to next call.  */
+   quotes, unfolded, but with the quotes stripped away.  If
+   THROW_ON_ERROR is true, encoding failures will throw an exception
+   rather than emitting a warning.  Result good to next call.  */

 static const char *
-ada_fold_name (gdb::string_view name)
+ada_fold_name (gdb::string_view name, bool throw_on_error = false)
 {
  static std::string fold_storage;

@ -911,14 +1041,120 @@ ada_fold_name (gdb::string_view name)
    fold_storage = gdb::to_string (name.substr (1, name.size () - 2));
  else
    {
-      fold_storage = gdb::to_string (name);
-      for (int i = 0; i < name.size (); i += 1)
-	fold_storage[i] = tolower (fold_storage[i]);
+      /* Why convert to UTF-32 and implement our own case-folding,
+	 rather than convert to wchar_t and use the platform's
+	 functions?  I'm glad you asked.
+
+	 The main problem is that GNAT implements an unusual rule for
+	 case folding.  For ASCII letters, letters in single-byte
+	 encodings (such as ISO-8859-*), and Unicode letters that fit
+	 in a single byte (i.e., code point is <= 0xff), the letter is
+	 folded to lower case.  Other Unicode letters are folded to
+	 upper case.
+
+	 This rule means that the code must be able to examine the
+	 value of the character.  And, some hosts do not use Unicode
+	 for wchar_t, so examining the value of such characters is
+	 forbidden.  */
+      auto_obstack storage;
+      try
+	{
+	  convert_between_encodings
+	    (host_charset (), HOST_UTF32,
+	     (const gdb_byte *) name.data (),
+	     name.length (), 1,
+	     &storage, translit_none);
+	}
+      catch (const gdb_exception &)
+	{
+	  if (throw_on_error)
+	    throw;
+
+	  static bool warned = false;
+
+	  /* Converting to UTF-32 shouldn't fail, so if it does, we
+	     might like to know why.  */
+	  if (!warned)
+	    {
+	      warned = true;
+	      warning (_("could not convert '%s' from the host encoding (%s) to UTF-32.\n"
+			 "This normally should not happen, please file a bug report."),
+		       gdb::to_string (name).c_str (), host_charset ());
+	    }
+
+	  /* We don't try to recover from errors; just return the
+	     original string.  */
+	  fold_storage = gdb::to_string (name);
+	  return fold_storage.c_str ();
+	}
+
+      bool is_utf8 = ada_source_charset == ada_utf8;
+      uint32_t *chars = (uint32_t *) obstack_base (&storage);
+      int num_chars = obstack_object_size (&storage) / sizeof (uint32_t);
+      for (int i = 0; i < num_chars; ++i)
+	{
+	  const struct utf8_entry *entry = find_case_fold_entry (chars[i]);
+	  if (entry != nullptr)
+	    {
+	      uint32_t low = chars[i] + entry->lower_delta;
+	      if (!is_utf8 || low <= 0xff)
+		chars[i] = low;
+	      else
+		chars[i] = chars[i] + entry->upper_delta;
+	    }
+	}
+
+      /* Now convert back to ordinary characters.  */
+      auto_obstack reconverted;
+      try
+	{
+	  convert_between_encodings (HOST_UTF32,
+				     host_charset (),
+				     (const gdb_byte *) chars,
+				     num_chars * sizeof (uint32_t),
+				     sizeof (uint32_t),
+				     &reconverted,
+				     translit_none);
+	  obstack_1grow (&reconverted, '\0');
+	  fold_storage = std::string ((const char *) obstack_base (&reconverted));
+	}
+      catch (const gdb_exception &)
+	{
+	  if (throw_on_error)
+	    throw;
+
+	  static bool warned = false;
+
+	  /* Converting back from UTF-32 shouldn't normally fail, but
+	     there are some host encodings without upper/lower
+	     equivalence.  */
+	  if (!warned)
+	    {
+	      warned = true;
+	      warning (_("could not convert the lower-cased variant of '%s'\n"
+			 "from UTF-32 to the host encoding (%s)."),
+		       gdb::to_string (name).c_str (), host_charset ());
+	    }
+
+	  /* We don't try to recover from errors; just return the
+	     original string.  */
+	  fold_storage = gdb::to_string (name);
+	}
    }

  return fold_storage.c_str ();
 }

+/* The "encoded" form of DECODED, according to GNAT conventions.  */
+
+std::string
+ada_encode (const char *decoded)
+{
+  if (decoded[0] != '<')
+    decoded = ada_fold_name (decoded);
+  return ada_encode_1 (decoded, true);
+}
+
 /* Return nonzero if C is either a digit or a lowercase alphabet character.  */

 static int
@ -999,6 +1235,72 @@ remove_compiler_suffix (const char *encoded, int *len)
  return -1;
 }

+/* Convert an ASCII hex string to a number.  Reads exactly N
+   characters from STR.  Returns true on success, false if one of the
+   digits was not a hex digit.  */
+static bool
+convert_hex (const char *str, int n, uint32_t *out)
+{
+  uint32_t result = 0;
+
+  for (int i = 0; i < n; ++i)
+    {
+      if (!isxdigit (str[i]))
+	return false;
+      result <<= 4;
+      result |= fromhex (str[i]);
+    }
+
+  *out = result;
+  return true;
+}
+
+/* Convert a wide character from its ASCII hex representation in STR
+   (consisting of exactly N characters) to the host encoding,
+   appending the resulting bytes to OUT.  If N==2 and the Ada source
+   charset is not UTF-8, then hex refers to an encoding in the
+   ADA_SOURCE_CHARSET; otherwise, use UTF-32.  Return true on success.
+   Return false and do not modify OUT on conversion failure.  */
+static bool
+convert_from_hex_encoded (std::string &out, const char *str, int n)
+{
+  uint32_t value;
+
+  if (!convert_hex (str, n, &value))
+    return false;
+  try
+    {
+      auto_obstack bytes;
+      /* In the 'U' case, the hex digits encode the character in the
+	 Ada source charset.  However, if the source charset is UTF-8,
+	 this really means it is a Unicode code point <= 0xff.  */
+      if (n == 2 && ada_source_charset != ada_utf8)
+	{
+	  gdb_byte one_char = (gdb_byte) value;
+
+	  convert_between_encodings (ada_source_charset, host_charset (),
+				     &one_char,
+				     sizeof (one_char), sizeof (one_char),
+				     &bytes, translit_none);
+	}
+      else
+	convert_between_encodings (HOST_UTF32, host_charset (),
+				   (const gdb_byte *) &value,
+				   sizeof (value), sizeof (value),
+				   &bytes, translit_none);
+      obstack_1grow (&bytes, '\0');
+      out.append ((const char *) obstack_base (&bytes));
+    }
+  catch (const gdb_exception &)
+    {
+      /* On failure, the caller will just let the encoded form
+	 through, which seems basically reasonable.  */
+      return false;
+    }
+
+  return true;
+}
+
 /* See ada-lang.h.  */

 std::string
@ -1191,6 +1493,32 @@ ada_decode (const char *encoded, bool wrap)
 	    i++;
 	}

+      if (i < len0 + 3 && encoded[i] == 'U' && isxdigit (encoded[i + 1]))
+	{
+	  if (convert_from_hex_encoded (decoded, &encoded[i + 1], 2))
+	    {
+	      i += 3;
+	      continue;
+	    }
+	}
+      else if (i < len0 + 5 && encoded[i] == 'W' && isxdigit (encoded[i + 1]))
+	{
+	  if (convert_from_hex_encoded (decoded, &encoded[i + 1], 4))
+	    {
+	      i += 5;
+	      continue;
+	    }
+	}
+      else if (i < len0 + 10 && encoded[i] == 'W' && encoded[i + 1] == 'W'
+	       && isxdigit (encoded[i + 2]))
+	{
+	  if (convert_from_hex_encoded (decoded, &encoded[i + 2], 8))
+	    {
+	      i += 10;
+	      continue;
+	    }
+	}
+
      if (encoded[i] == 'X' && i != 0 && isalnum (encoded[i - 1]))
 	{
 	  /* This is a X[bn]* sequence not separated from the previous
@ -6212,7 +6540,6 @@ ada_get_tsd_from_tag (struct value *tag)
 static gdb::unique_xmalloc_ptr<char>
 ada_tag_name_from_tsd (struct value *tsd)
 {
-  char *p;
  struct value *val;

  val = ada_value_struct_elt (tsd, "expanded_name", 1);
@ -6223,13 +6550,18 @@ ada_tag_name_from_tsd (struct value *tsd)
  if (buffer == nullptr)
    return nullptr;

-  for (p = buffer.get (); *p != '\0'; ++p)
+  try
    {
-      if (isalpha (*p))
-	*p = tolower (*p);
+      /* Let this throw an exception on error.  If the data is
+	 uninitialized, we'd rather not have the user see a
+	 warning.  */
+      const char *folded = ada_fold_name (buffer.get (), true);
+      return make_unique_xstrdup (folded);
+    }
+  catch (const gdb_exception &)
+    {
+      return nullptr;
    }
-
-  return buffer;
 }

 /* The type name of the dynamic type denoted by the 'tag value TAG, as
@ -13435,6 +13767,26 @@ ada_free_objfile_observer (struct objfile *objfile)
  ada_clear_symbol_cache ();
 }

+/* Charsets known to GNAT.  */
+static const char * const gnat_source_charsets[] =
+{
+  /* Note that code below assumes that the default comes first.
+     Latin-1 is the default here, because that is also GNAT's
+     default.  */
+  "ISO-8859-1",
+  "ISO-8859-2",
+  "ISO-8859-3",
+  "ISO-8859-4",
+  "ISO-8859-5",
+  "ISO-8859-15",
+  "CP437",
+  "CP850",
+  /* Note that this value is special-cased in the encoder and
+     decoder.  */
+  ada_utf8,
+  nullptr
+};
+
 void _initialize_ada_language ();
 void
 _initialize_ada_language ()
@ -13470,6 +13822,17 @@ Show whether the output of formal and return types for functions in the \
 overloads selection menu is activated."),
 			   NULL, NULL, NULL, &set_ada_list, &show_ada_list);

+  ada_source_charset = gnat_source_charsets[0];
+  add_setshow_enum_cmd ("source-charset", class_files,
+			gnat_source_charsets,
+			&ada_source_charset,  _("\
+Set the Ada source character set."), _("\
+Show the Ada source character set."), _("\
+The character set used for Ada source files.\n\
+This must correspond to the '-gnati' or '-gnatW' option passed to GNAT."),
+			nullptr, nullptr,
+			&set_ada_list, &show_ada_list);
+
  add_catch_command ("exception", _("\
 Catch Ada exceptions, when raised.\n\
 Usage: catch exception [ARG] [if CONDITION]\n\
--- a/gdb/ada-lex.l
+++ b/gdb/ada-lex.l
@ -30,7 +30,7 @@ HEXDIG	[0-9a-f]
 NUM16	({HEXDIG}({HEXDIG}|_)*)
 OCTDIG	[0-7]
 LETTER	[a-z_]
-ID	({LETTER}({LETTER}|{DIG})*|"<"{LETTER}({LETTER}|{DIG})*">")
+ID	({LETTER}({LETTER}|{DIG}|[\x80-\xff])*|"<"{LETTER}({LETTER}|{DIG})*">")
 WHITE	[ \t\n]
 TICK	("'"{WHITE}*)
 GRAPHIC [a-z0-9 #&'()*+,-./:;<>=_|!$%?@\[\]\\^`{}~]
--- a/gdb/ada-unicode.py
+++ b/gdb/ada-unicode.py
@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+# Generate Unicode case-folding table for Ada.
+
+# Copyright (C) 2022 Free Software Foundation, Inc.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# This generates the ada-casefold.h header.
+# Usage:
+#   python ada-unicode.py
+
+import gdbcopyright
+
+# The start of the current range of case-conversions we are
+# processing.  If RANGE_START is None, then we're outside of a range.
+range_start = None
+# End of the current range.
+range_end = None
+# The delta between RANGE_START and the upper-case variant of that
+# character.
+upper_delta = None
+# The delta between RANGE_START and the lower-case variant of that
+# character.
+lower_delta = None
+
+# All the ranges found and completed so far.
+# Each entry is a tuple of the form (START, END, UPPER_DELTA, LOWER_DELTA).
+all_ranges = []
+
+
+def finish_range():
+    global range_start
+    global range_end
+    global upper_delta
+    global lower_delta
+    if range_start is not None:
+        all_ranges.append((range_start, range_end, upper_delta, lower_delta))
+        range_start = None
+        range_end = None
+        upper_delta = None
+        lower_delta = None
+
+
+def process_codepoint(val):
+    global range_start
+    global range_end
+    global upper_delta
+    global lower_delta
+    c = chr(val)
+    low = c.lower()
+    up = c.upper()
+    # U+00DF ("LATIN SMALL LETTER SHARP S", aka eszett) traditionally
+    # upper-cases to the two-character string "SS" (the capital form
+    # is a relatively recent addition -- 2017).  Our simple scheme
+    # can't handle this, so we skip it.  Also, because our approach
+    # just represents runs of characters with identical folding
+    # deltas, this change must terminate the current run.
+    if (c == low and c == up) or len(low) != 1 or len(up) != 1:
+        finish_range()
+        return
+    updelta = ord(up) - val
+    lowdelta = ord(low) - val
+    if range_start is not None and (updelta != upper_delta or lowdelta != lower_delta):
+        finish_range()
+    if range_start is None:
+        range_start = val
+        upper_delta = updelta
+        lower_delta = lowdelta
+    range_end = val
+
+
+for c in range(0, 0x10FFFF):
+    process_codepoint(c)
+
+with open("ada-casefold.h", "w") as f:
+    print(
+        gdbcopyright.copyright("ada-unicode.py", "UTF-32 case-folding for GDB"),
+        file=f,
+    )
+    for r in all_ranges:
+        print(f"   {{{r[0]}, {r[1]}, {r[2]}, {r[3]}}},", file=f)
--- a/gdb/doc/gdb.texinfo
+++ b/gdb/doc/gdb.texinfo
@ -18012,6 +18012,7 @@ to be difficult.
 * Ravenscar Profile::           Tasking Support when using the Ravenscar
                                   Profile
 * Ada Settings::                New settable GDB parameters for Ada.
+* Ada Source Character Set::    Character set of Ada source files.
 * Ada Glitches::                Known peculiarities of Ada mode.
@end menu

@ -18762,6 +18763,28 @@ size is less than @var{size}.
 Show the limit on types whose size is determined by run-time quantities.
@end table

+@node Ada Source Character Set
+@subsubsection Ada Source Character Set
+@cindex Ada, source character set
+
+The GNAT compiler supports a number of character sets for source
+files.  @xref{Character Set Control, , Character Set Control,
+gnat_ugn}.  @value{GDBN} includes support for this as well.
+
+@table @code
+@item set ada source-charset @var{charset}
+@kindex set ada source-charset
+Set the source character set for Ada.  The character set must be
+supported by GNAT.  Because this setting affects the decoding of
+symbols coming from the debug information in your program, the setting
+should be set as early as possible.  The default is @code{ISO-8859-1},
+because that is also GNAT's default.
+
+@item show ada source-charset
+@kindex show ada source-charset
+Show the current source character set for Ada.
+@end table
+
@node Ada Glitches
@subsubsection Known Peculiarities of Ada Mode
@cindex Ada, problems
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-1.exp
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-1.exp
@ -0,0 +1,50 @@
+# Copyright 2022 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test Latin-1 identifiers.
+
+load_lib "ada.exp"
+
+if { [skip_ada_tests] } { return -1 }
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.  We
+# want this despite the program itself using Latin-1, as this test is
+# written using UTF-8.
+setenv LC_ALL C.UTF-8
+
+standard_ada_testfile prog
+
+set flags [list debug additional_flags=-gnati1]
+if {[gdb_compile_ada "${srcfile}" "${binfile}" executable $flags] != ""} {
+    return -1
+}
+
+# Restart without an executable so that we can set the encoding early.
+clean_restart
+
+# The default is Latin-1, but set this explicitly just in case we get
+# to change the default someday.
+gdb_test_no_output "set ada source-charset ISO-8859-1"
+
+gdb_load ${binfile}
+
+set bp_location [gdb_get_line_number "BREAK" ${testdir}/prog.adb]
+runto "prog.adb:$bp_location"
+
+gdb_test "print VAR_Þ" " = 23"
+gdb_test "print var_þ" " = 23"
+
+gdb_breakpoint "FUNC_Þ" message
+gdb_breakpoint "func_þ" message
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-1/pack.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-1/pack.adb
@ -0,0 +1,28 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-1 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+package body Pack is
+
+   function FUNC_Þ (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_Þ;
+
+   procedure Do_Nothing (A : System.Address) is
+   begin
+      null;
+   end Do_Nothing;
+
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-1/pack.ads
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-1/pack.ads
@ -0,0 +1,21 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-1 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with System;
+package Pack is
+   function FUNC_Þ (x : Integer) return Integer;
+
+   procedure Do_Nothing (A : System.Address);
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-1/prog.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-1/prog.adb
@ -0,0 +1,23 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-1 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with Pack; use Pack;
+
+procedure Prog is
+   -- This should be var_Ufe.
+   VAR_Þ : Integer := FUNC_Þ (23);
+begin
+   Do_Nothing (var_þ'Address); --  BREAK
+end Prog;
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-3.exp
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-3.exp
@ -0,0 +1,50 @@
+# Copyright 2022 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test Latin-3 identifiers.
+
+load_lib "ada.exp"
+
+if { [skip_ada_tests] } { return -1 }
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.  We
+# want this despite the program itself using Latin-3, as this test is
+# written using UTF-8.
+setenv LC_ALL C.UTF-8
+
+standard_ada_testfile prog
+
+set flags [list debug additional_flags=-gnati3]
+if {[gdb_compile_ada "${srcfile}" "${binfile}" executable $flags] != ""} {
+    return -1
+}
+
+# Restart without an executable so that we can set the encoding early.
+clean_restart
+
+gdb_test_no_output "set ada source-charset ISO-8859-3"
+
+gdb_load ${binfile}
+
+set bp_location [gdb_get_line_number "BREAK" ${testdir}/prog.adb]
+runto "prog.adb:$bp_location"
+
+gdb_test "print VAR_Ż" " = 23"
+gdb_test "print var_ż" " = 23"
+
+gdb_breakpoint "FUNC_Ż" message
+gdb_breakpoint "func_ż" message
+
+gdb_test "print var_𝕯" "warning: charset conversion failure.*"
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-3/pack.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-3/pack.adb
@ -0,0 +1,28 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-3 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+package body Pack is
+
+   function FUNC_¯ (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_¯;
+
+   procedure Do_Nothing (A : System.Address) is
+   begin
+      null;
+   end Do_Nothing;
+
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-3/pack.ads
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-3/pack.ads
@ -0,0 +1,21 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-3 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with System;
+package Pack is
+   function FUNC_¯ (x : Integer) return Integer;  --  Returns x unchanged.
+
+   procedure Do_Nothing (A : System.Address);  --  No-op; A is not used.
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-latin-3/prog.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-latin-3/prog.adb
@ -0,0 +1,24 @@
+--  Copyright 2022 Free Software Foundation, Inc. -*- coding: iso-latin-3 -*-
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with Pack; use Pack;
+
+procedure Prog is
+   -- The name uses Z with dot above, a Latin-3 letter absent from Latin-1.
+   -- GNAT lower-cases it, so the symbol should be var_Ubf.
+   VAR_¯ : Integer := FUNC_¯ (23);
+begin
+   Do_Nothing (var_¿'Address); --  BREAK
+end Prog;
--- a/gdb/testsuite/gdb.ada/non-ascii-utf-8.exp
+++ b/gdb/testsuite/gdb.ada/non-ascii-utf-8.exp
@ -0,0 +1,57 @@
+# Copyright 2022 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test Ada identifiers written in UTF-8, using both upper and lower case.
+
+load_lib "ada.exp"
+
+if { [skip_ada_tests] } { return -1 }
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+standard_ada_testfile prog
+
+set flags [list debug additional_flags=-gnatW8]
+if {[gdb_compile_ada "${srcfile}" "${binfile}" executable $flags] != ""} {
+    return -1
+}
+
+# Start gdb without the executable so the charset is set before it is loaded.
+clean_restart
+
+gdb_test_no_output "set ada source-charset UTF-8"
+
+gdb_load ${binfile}
+
+set bp_location [gdb_get_line_number "BREAK" ${testdir}/prog.adb]
+runto "prog.adb:$bp_location"
+# Each variable must be found under either case of its name.
+gdb_test "print VAR_Ü" " = 23"
+gdb_test "print var_ü" " = 23"
+gdb_test "print VAR_Ƹ" " = 24"
+gdb_test "print var_ƹ" " = 24"
+gdb_test "print VAR_𐐁" " = 25"
+gdb_test "print var_𐐩" " = 25"
+gdb_test "print VAR_Ż" " = 26"
+gdb_test "print var_ż" " = 26"
+# Breakpoints by function name; FUNC_𐐁 is only tried in upper case here.
+gdb_breakpoint "FUNC_Ü" message
+gdb_breakpoint "func_ü" message
+gdb_breakpoint "FUNC_Ƹ" message
+gdb_breakpoint "func_ƹ" message
+gdb_breakpoint "FUNC_Ż" message
+gdb_breakpoint "func_ż" message
+gdb_breakpoint "FUNC_𐐁" message
--- a/gdb/testsuite/gdb.ada/non-ascii-utf-8/pack.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-utf-8/pack.adb
@ -0,0 +1,43 @@
+--  Copyright 2022 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+package body Pack is
+   --  Each FUNC_* below returns its argument unchanged.
+   function FUNC_Ü (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_Ü;
+
+   function FUNC_Ƹ (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_Ƹ;
+
+   function FUNC_𐐁 (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_𐐁;
+
+   function FUNC_Ż (x : Integer) return Integer is
+   begin
+      return x;
+   end FUNC_Ż;
+
+   procedure Do_Nothing (A : System.Address) is
+   begin
+      null;  --  Deliberately a no-op; A is not used.
+   end Do_Nothing;
+
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-utf-8/pack.ads
+++ b/gdb/testsuite/gdb.ada/non-ascii-utf-8/pack.ads
@ -0,0 +1,24 @@
+--  Copyright 2022 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with System;
+package Pack is
+   function FUNC_Ü (x : Integer) return Integer;  --  Returns x unchanged.
+   function FUNC_Ƹ (x : Integer) return Integer;  --  Returns x unchanged.
+   function FUNC_𐐁 (x : Integer) return Integer;  --  Returns x unchanged.
+   function FUNC_Ż (x : Integer) return Integer;  --  Returns x unchanged.
+
+   procedure Do_Nothing (A : System.Address);  --  No-op; A is not used.
+end Pack;
--- a/gdb/testsuite/gdb.ada/non-ascii-utf-8/prog.adb
+++ b/gdb/testsuite/gdb.ada/non-ascii-utf-8/prog.adb
@ -0,0 +1,36 @@
+--  Copyright 2022 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+with Pack; use Pack;
+
+procedure Prog is
+   -- This should be var_Ufc: a letter that fits in one byte is lower-cased.
+   VAR_Ü : Integer := FUNC_Ü (23);
+   -- This should be var_W01b8, because with UTF-8, letters that do
+   -- not fit in a single byte are upper-cased.
+   VAR_Ƹ : Integer := FUNC_Ƹ (24);
+   -- This should be var_WW00010401: it lies outside the base plane,
+   -- and like other multi-byte letters it is upper-cased.
+   VAR_𐐁 : Integer := FUNC_𐐁 (25);
+   -- This is the same name as the corresponding Latin 3 test,
+   -- and helps show the peculiarity of the case folding rule.
+   -- This winds up as var_W017b, the upper-case variant.
+   VAR_Ż : Integer := FUNC_Ż (26);
+begin
+   Do_Nothing (var_ü'Address); --  BREAK
+   Do_Nothing (var_ƹ'Address);
+   Do_Nothing (var_𐐩'Address);
+   Do_Nothing (var_ż'Address);
+end Prog;