IT技术 - Javascript：Unicode 字符串到十六进制 - 吾爱随笔录

Javascript：Unicode 字符串到十六进制

IT技术 javascript unicode utf-8 hex

2021-02-04 13:28:37

我正在尝试将 unicode 字符串转换为 javascript 中的十六进制表示。

这就是我所拥有的：

/**
 * Decode a hex string by treating every two hex digits as one character code.
 *
 * Each group holds at most two digits, so every decoded code unit lies in the
 * range 0x00-0xFF. A trailing single digit is parsed on its own.
 *
 * @param {*} hex - Value whose `toString()` yields the hex digits.
 * @returns {string} One character per two-digit group.
 */
function convertFromHex(hex) {
    var digits = hex.toString(); // force conversion to a string
    var chars = [];
    var pos = 0;
    while (pos < digits.length) {
        chars.push(String.fromCharCode(parseInt(digits.substr(pos, 2), 16)));
        pos += 2;
    }
    return chars.join('');
}

/**
 * Concatenate the hexadecimal form of every UTF-16 code unit in `str`.
 *
 * No zero padding is applied, so each code unit contributes between one and
 * four hex digits to the result.
 *
 * @param {string} str - Text to encode.
 * @returns {string} The unpadded hex digits of all code units, in order.
 */
function convertToHex(str) {
    var pieces = [];
    var idx = 0;
    while (idx < str.length) {
        pieces.push(str.charCodeAt(idx).toString(16));
        idx += 1;
    }
    return pieces.join('');
}

但是它在处理 Unicode 字符（比如中文）时会失败：

输入：漢字

输出： ªo"[W

有任何想法吗？这可以在javascript中完成吗？

6个回答

请记住，JavaScript 代码单元的宽度为 16 位。因此，十六进制字符串形式将为每个代码单元 4 位数字。

用法：

// Round trip: encode the string to hex, decode it again, and display the result.
var str = "\u6f22\u5b57"; // "\u6f22\u5b57" === "漢字"
alert(str.hexEncode().hexDecode());

字符串到十六进制形式：

/**
 * Encode this string as hex, four digits per UTF-16 code unit.
 *
 * Every code unit is left-padded with zeros to exactly four hex digits, so
 * the output length is always four times the string length.
 *
 * @returns {string} Lower-case hex digits for all code units, in order.
 */
String.prototype.hexEncode = function () {
    var pieces = [];
    for (var pos = 0, len = this.length; pos < len; pos += 1) {
        var unit = this.charCodeAt(pos).toString(16);
        // Prefix three zeros and keep the last four characters to pad to width 4.
        pieces.push(("000" + unit).slice(-4));
    }
    return pieces.join("");
};

再解码回来：

/**
 * Decode this string of hex digits back to text, four digits per code unit.
 *
 * The string is cut into groups of up to four characters; each group is
 * parsed as base-16 and turned into one UTF-16 code unit.
 *
 * @returns {string} The decoded text, or "" when there is nothing to match.
 */
String.prototype.hexDecode = function () {
    var groups = this.match(/.{1,4}/g);
    // match() returns null when nothing matches (e.g. an empty string).
    if (!groups) {
        return "";
    }
    return groups.map(function (group) {
        return String.fromCharCode(parseInt(group, 16));
    }).join("");
};

"\u6f22\u5b57" 是字面量 "漢字" 的 Unicode 转义形式，就像 \n 是换行符的转义形式一样。我倾向于使用转义形式来避免歧义和字符编码问题。有关详细信息，请参阅规范。如果要自己生成这种转义形式，请把上面代码中的 ("000"+hex).slice(-4) 改为 "\\u" + ("000"+hex).slice(-4)。表达式 "\u6f22\u5b57" === "漢字" 的计算结果为 true，因为代码解析之后两者是同一个字符串。

2021-03-25 13:28:37

谢谢，虽然只有 1 个问题（可能是一个愚蠢的问题..）——你如何从 javascript 中的汉字中得到 \u6f22\u5b57 ？最接近的是 escape() 函数，但它使用 % - 我猜可以使用某种正则表达式将 % 替换为 / - 但 escape() 函数也已弃用。EncodeURI 和 encodeURIComponent 都给出不同的输出。任何的想法？

2021-03-30 13:28:37

谢谢，我遇到了一个问题：有时 hex.match(/.{1,4}/g) 匹配不到任何内容（错误：null is not an object (evaluating 'hexes.length')）——你知道可能是什么原因吗？

2021-04-02 13:28:37

如果按原样使用最上面的算法，"test" 会被编码为 "0074006500730074"。这里没有 ASCII；JavaScript 字符串始终是 UTF-16。

2021-04-03 13:28:37

我修复了 hexDecode 函数，因为它似乎不起作用；

// Sample inputs. The doubled backslashes make these strings contain the
// escape text itself (backslash, marker letter, hex digits), not the
// characters the escapes would stand for.
var a = "\\x73\\x75\\x62\\x73\\x74\\x72";
var str = "\\u6f22\\u5b57";

/**
 * Decode a string made of backslash escape sequences such as \x73 or \u6f22.
 *
 * The string is split on backslashes; whatever precedes the first backslash
 * is skipped. For every remaining piece the first character (the marker
 * letter) is dropped and the rest is parsed as a base-16 character code.
 *
 * @returns {string} One character per escape sequence.
 */
String.prototype.hexDecode = function () {
    return this.split("\\").slice(1).map(function (piece) {
        return String.fromCharCode(parseInt(piece.slice(1), 16));
    }).join("");
};

a.hexDecode(); //"substr"
str.hexDecode(); //"漢字"

这也适用于十六进制转义序列

2021-04-12 13:28:37

这是对 McDowell 算法的一个不填充结果的调整：

  /**
   * Hex-encode a string without padding: the base-16 form of each UTF-16
   * code unit is appended as-is, so a code unit yields one to four digits.
   *
   * @param {string} str - Text to encode.
   * @returns {string} Concatenated unpadded hex digits.
   */
  function toHex(str) {
    var digits = [];
    var pos = 0;
    while (pos < str.length) {
      digits.push(str.charCodeAt(pos).toString(16));
      pos += 1;
    }
    return digits.join('');
  }

例如，如果您需要对字符串或类似的东西进行十六进制编码

2021-03-27 13:28:37

向 @redgeoff 致敬！把字符串传给 PHP 并用 hex2bin() 解码时，这个方案可以正常工作。

2021-04-03 13:28:37

不确定我在看什么，但这对我获取用户的私人CouchDB数据库很有用！谢谢

2021-04-09 13:28:37

你为什么不想做填充（补零）？不填充的话，十六进制输出是有歧义的。

2021-04-10 13:28:37

这取决于您使用的编码。如果要将 utf-8 编码的十六进制转换为字符串，请使用以下命令：

/**
 * Decode a hex string of UTF-8 bytes into text.
 *
 * Every pair of characters is prefixed with '%' so the whole input becomes a
 * percent-encoded sequence, which decodeURIComponent then decodes as UTF-8.
 * If anything throws, the problem is logged and the input is returned as-is.
 *
 * @param {string} hex - Hex digits, two per UTF-8 byte.
 * @param {*} [str] - Ignored; kept only so the signature is unchanged.
 * @returns {string} The decoded text, or `hex` itself when decoding fails.
 */
function fromHex(hex,str){
  try {
    var percentEncoded = hex.replace(/(..)/g, '%$1');
    return decodeURIComponent(percentEncoded);
  } catch (e) {
    console.log('invalid hex input: ' + hex);
    return hex;
  }
}

对于另一个方向，请使用：

/**
 * Encode text as a hex string of its UTF-8 bytes, two digits per byte.
 *
 * encodeURIComponent percent-encodes the UTF-8 bytes of `str`; unescape then
 * turns every %XX escape into a character whose code equals that byte, so
 * the intermediate string holds exactly one character per UTF-8 byte.
 *
 * @param {string} str - Text to encode.
 * @param {*} [hex] - Ignored on input; only used as a scratch variable.
 * @returns {string} Lower-case hex digits, or `str` itself when encoding
 *   throws (encodeURIComponent rejects lone surrogates); the failure is logged.
 */
function toHex(str,hex){
  try{
    hex = unescape(encodeURIComponent(str))
    .split('').map(function(v){
      // Always emit two digits: without the padding, bytes below 0x10
      // (e.g. "\n" -> "a") make the output ambiguous and undecodable.
      return v.charCodeAt(0).toString(16).padStart(2, '0')
    }).join('')
  }
  catch(e){
    hex = str
    console.log('invalid text input: ' + str)
  }
  return hex
}

注意：每个字节都必须输出为两位十六进制数字（不足两位时在前面补 0），也就是让 map 回调返回 v.charCodeAt(0).toString(16).padStart(2, '0')；否则像 \n（0x0a）这样小于 0x10 的字节只会输出一位，结果无法正确解码。

2021-03-14 13:28:37

对于 toHex 函数，如果字节值小于 0x10，就需要在前面补 '0'。例如文本中出现 \t 或 \n 时，会显示为 '9' 或 'a'，但它们应该分别是 '09' 和 '0a'。

2021-03-21 13:28:37

一个更新的解决方案，用于编码：

/**
 * Format a sequence of byte values as a lower-case hex string, two digits
 * per byte. Used with every encoder in this answer; mostly a debugging aid.
 *
 * @param {Iterable<number>|ArrayLike<number>} bytes - Byte values to format.
 * @returns {string} Concatenated two-digit hex values.
 */
function bytesToHex(bytes) {
  let hex = "";
  for (const byte of Array.from(bytes)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}

/**
 * Encode a string as UTF-8 with the built-in TextEncoder.
 * UTF-8 is almost always the encoding you want.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes of `string`.
 */
function stringToUTF8Bytes(string) {
  const encoder = new TextEncoder();
  const utf8 = encoder.encode(string);
  return utf8;
}

/**
 * Encode a string as UTF-16 bytes in the requested byte order.
 *
 * `charCodeAt(index)` returns the underlying UTF-16 code units (not code
 * points), so each one only has to be written out as two bytes.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - Write each code unit little-endian when
 *   true; big-endian when false or omitted.
 * @returns {Uint8Array} Two bytes per UTF-16 code unit.
 */
function stringToUTF16Bytes(string, littleEndian) {
  const bytes = new Uint8Array(string.length * 2);
  // DataView lets the byte order be chosen explicitly per write.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i !== string.length; i++) {
    // setUint16 takes a BYTE offset: code unit i lives at byte i * 2.
    view.setUint16(i * 2, string.charCodeAt(i), littleEndian);
  }
  return bytes;
}

/**
 * Encode a string as UTF-32 bytes in the requested byte order.
 *
 * Iterating a string yields its code points, and a code point's numeric
 * value is its UTF-32 encoding, so each one is written as four bytes.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - Write each code point little-endian when
 *   true; big-endian when false or omitted.
 * @returns {Uint8Array} Four bytes per code point.
 */
function stringToUTF32Bytes(string, littleEndian) {
  const codepoints = Array.from(string, c => c.codePointAt(0));
  const bytes = new Uint8Array(codepoints.length * 4);
  // DataView lets the byte order be chosen explicitly per write.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i != codepoints.length; i++) {
    // setUint32 takes a BYTE offset: code point i lives at byte i * 4.
    view.setUint32(i * 4, codepoints[i], littleEndian);
  }
  return bytes;
}

例子：

// Hex dump of the same sample string under each encoder; the expected
// output is in the comment below each call. The second argument of the
// UTF-16/UTF-32 encoders is their littleEndian flag.
bytesToHex(stringToUTF8Bytes("hello 漢字 👍"))
// "68656c6c6f20e6bca2e5ad9720f09f918d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", false))
// "00680065006c006c006f00206f225b570020d83ddc4d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", true))
// "680065006c006c006f002000226f575b20003dd84ddc"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", false))
// "00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", true))
// "68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"

对于解码，通常要简单得多，您只需要：

/**
 * Convert a hex string into bytes, two hex digits per byte.
 *
 * The result has `hex.length / 2` entries (the fractional part is dropped
 * by the Uint8Array constructor); each entry is the base-16 value of the
 * corresponding two-character slice.
 *
 * @param {string} hex - Hex digits to convert.
 * @returns {Uint8Array} The parsed bytes.
 */
function hexToBytes(hex) {
    return new Uint8Array(hex.length / 2).map(
        (_, index) => parseInt(hex.substr(index * 2, 2), 16)
    );
}

然后使用 TextDecoder 的编码参数：

// Decode the bytes produced by hexToBytes() back into text. The argument
// passed to the TextDecoder constructor names the encoding to use.
// UTF-8 is default
new TextDecoder().decode(hexToBytes("68656c6c6f20e6bca2e5ad9720f09f918d"));
// but you can also use:
new TextDecoder("UTF-16LE").decode(hexToBytes("680065006c006c006f002000226f575b20003dd84ddc"))
new TextDecoder("UTF-16BE").decode(hexToBytes("00680065006c006c006f00206f225b570020d83ddc4d"));
// expected result of each call: "hello 漢字 👍"

以下是允许的编码名称列表：https://www.w3.org/TR/encoding/#names-and-labels

您可能会注意到 UTF-32 不在该列表中，这很痛苦，因此：

/**
 * Decode UTF-32 bytes into a string.
 *
 * @param {Uint8Array} bytes - UTF-32 data, four bytes per code point. Trailing
 *   bytes that do not form a full group of four are ignored.
 * @param {boolean} [littleEndian] - Read each code point little-endian when
 *   true; big-endian when false or omitted.
 * @returns {string} The decoded text.
 * @throws {RangeError} From String.fromCodePoint when a value is not a valid
 *   code point.
 */
function bytesToStringUTF32(bytes, littleEndian) {
  // Restrict the view to this array's own window: `bytes` may be a
  // subarray that covers only part of a larger underlying buffer.
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const codepoints = new Uint32Array(Math.floor(view.byteLength / 4));
  for (let i = 0; i !== codepoints.length; i++) {
    codepoints[i] = view.getUint32(i * 4, littleEndian);
  }
  // Convert in bounded chunks: spreading every code point into a single
  // call can exceed the engine's argument limit on large inputs.
  const CHUNK_SIZE = 8192;
  let result = "";
  for (let start = 0; start < codepoints.length; start += CHUNK_SIZE) {
    result += String.fromCodePoint(...codepoints.subarray(start, start + CHUNK_SIZE));
  }
  return result;
}

然后：

// Decode the big-endian (false) and little-endian (true) UTF-32 hex dumps.
bytesToStringUTF32(hexToBytes("00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"), false)
bytesToStringUTF32(hexToBytes("68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"), true)
// expected result of each call: "hello 漢字 👍"

如何在 JavaScript 中从 漢字 得到 "\u6f22\u5b57"？

这些是 JavaScript 的 Unicode 转义序列，例如 \u12AB。要生成它们，可以遍历字符串中的每个代码单元，对它调用 .toString(16)，然后在此基础上拼出转义序列。

然而，尽可能在输出中使用十六进制转义序列\xAA会更有效。

还要注意的是ASCII符号，如A，b和-可能不需要进行转义。

我编写了一个小型 JavaScript 库来为您完成所有这些工作，名为jsesc. 它有很多选项来控制输出。

这是该工具的在线演示：http://mothereff.in/js-escapes#1%E6%BC%A2%E5%AD%97

您的问题被标记为utf-8。阅读您的其余问题，UTF-8 编码/解码似乎不是您想要的，但如果您需要它：使用utf8.js（在线演示）。

其它你可能感兴趣的问题

上一篇根据属性“数据排序”对 jQuery 中的 div 进行排序？下一篇在 Meteor.js 中使用多个 Mongodb 数据库