如何在 Swift 中解码 HTML 实体?

我从一个网站拉取 JSON 文件,收到的其中一个字符串是:

The Weeknd &#8216;King Of The Fall&#8217; [Video Premiere] | @TheWeeknd | #SoPhi

我怎样才能将 &#8216; 这样的东西转换成正确的字符?

我已经做了一个 Xcode Playground 来演示它:

 import UIKit

 // Receives any error reported by NSJSONSerialization below.
 var error: NSError?
 // Endpoint that returns the blog's recent posts as JSON.
 let blogUrl: NSURL = NSURL.URLWithString("http://sophisticatedignorance.net/api/get_recent_summary/")
 // Loads the response body from the URL.
 let jsonData = NSData(contentsOfURL: blogUrl)
 // Parses the JSON and casts the top-level object to a dictionary.
 let dataDictionary = NSJSONSerialization.JSONObjectWithData(jsonData, options: nil, error: &error) as NSDictionary
 // The "posts" entry, cast to an array.
 var a = dataDictionary["posts"] as NSArray
 // Prints the first post's title, which still contains HTML entities such as &#8216;.
 println(a[0]["title"])

没有直接的方法可以做到这一点,但你可以借助 NSAttributedString 的“魔法”让这个过程尽可能无痛(警告:这种方法会剥离所有的 HTML 标签):

 // In your case `encodedString` would be a[0]["title"].
 let encodedString = "The Weeknd <em>&#8216;King Of The Fall&#8217;</em>"

 // Ask NSAttributedString to interpret the data as a UTF-8 encoded HTML document.
 let options: [String: Any] = [
     NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
     NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
 ]

 // Both steps can fail (UTF-8 encoding, HTML import); either failure yields nil.
 // This is top-level code, so failures are carried as an optional instead of `return nil`.
 let attributedString = encodedString.data(using: .utf8).flatMap { data in
     try? NSAttributedString(data: data, options: options, documentAttributes: nil)
 }

 // Plain text of the parsed document: entities are decoded and the <em> tags are gone.
 let decodedString = attributedString?.string // The Weeknd ‘King Of The Fall’

请记住只能在主线程上初始化 NSAttributedString。它在底层使用了一些 WebKit 的机制,因此有这个要求。


您可以创建自己的 String 扩展来提高可重用性:

 extension String {

     /// Creates a string from the plain text of `htmlEncodedString` parsed as a UTF-8 HTML
     /// document, so character entities are resolved by the HTML importer.
     ///
     /// - Parameter htmlEncodedString: Text that may contain HTML entities or tags.
     /// - Returns: `nil` when the text cannot be encoded as UTF-8 or the HTML import throws.
     init?(htmlEncodedString: String) {
         let readingOptions: [String: Any] = [
             NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
             NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
         ]

         guard
             let utf8Data = htmlEncodedString.data(using: .utf8),
             let parsed = try? NSAttributedString(data: utf8Data, options: readingOptions, documentAttributes: nil)
         else {
             return nil
         }

         self.init(parsed.string)
     }
 }

 let encodedString = "The Weeknd <em>&#8216;King Of The Fall&#8217;</em>"
 let decodedString = String(htmlEncodedString: encodedString)

@akashivskyy 的答案很棒,演示了如何利用 NSAttributedString 来解码 HTML 实体。一个可能的缺点(如他所述)是所有的 HTML 标记也会被删除,所以下面第一行的输入会变成第二行的输出:

 <strong> 4 &lt; 5 &amp; 3 &gt; 2</strong> 

 4 < 5 & 3 > 2 

在 OS X 上可以用 CFXMLCreateStringByUnescapingEntities() 完成这项工作:

 // Sample input mixing markup, named entities and numeric (hex and decimal) entities.
 let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Price: 12 &#x20ac;. &#64; "
 // Unescapes the entities; per the expected output below, the <strong> tags are kept.
 let decoded = CFXMLCreateStringByUnescapingEntities(nil, encoded, nil) as String
 println(decoded)
 // <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @ 

但是这在iOS上不可用。

这是一个纯 Swift 实现。它使用字典解码 &lt; 这样的字符实体引用,并解码所有数字字符实体,如 &#64; 或 &#x20ac;。(请注意,我没有明确列出全部 252 个 HTML 实体。)

Swift 2:

 // Mapping from XML/HTML character entity reference to character
 // From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
 private let characterEntities : [ String : Character ] = [
     // XML predefined entities:
     "&quot;"    : "\"",
     "&amp;"     : "&",
     "&apos;"    : "'",
     "&lt;"      : "<",
     "&gt;"      : ">",

     // HTML character entity references:
     "&nbsp;"    : "\u{00a0}",
     // ...
     "&diams;"   : "♦",
 ]

 extension String {

     /// Returns a new string made by replacing in the `String`
     /// all HTML character entity references with the corresponding
     /// character.
     ///
     /// Entities that cannot be decoded (unknown names, malformed or out-of-range
     /// numbers) are copied to the result unchanged.
     var stringByDecodingHTMLEntities : String {

         // ===== Utility functions =====

         // Convert the number in the string to the corresponding
         // Unicode character, e.g.
         //    decodeNumeric("64", 10)   --> "@"
         //    decodeNumeric("20ac", 16) --> "€"
         // Returns `nil` if `string` is not a number in the given base or
         // does not denote a valid Unicode scalar value.
         func decodeNumeric(string : String, base : Int) -> Character? {
             guard let code = UInt32(string, radix: base) else { return nil }
             // `UnicodeScalar(UInt32)` traps on surrogates and on values above U+10FFFF,
             // so reject those before constructing the scalar.
             guard code < 0xD800 || (code > 0xDFFF && code <= 0x10FFFF) else { return nil }
             return Character(UnicodeScalar(code))
         }

         // Decode the HTML character entity to the corresponding
         // Unicode character, return `nil` for invalid input.
         //     decode("&#64;")    --> "@"
         //     decode("&#x20ac;") --> "€"
         //     decode("&lt;")     --> "<"
         //     decode("&foo;")    --> nil
         func decode(entity : String) -> Character? {
             // Every entity passed in ends with ';'; drop it before parsing the digits.
             let digitsEnd = entity.endIndex.predecessor()

             if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                 return decodeNumeric(entity.substringWithRange(entity.startIndex.advancedBy(3) ..< digitsEnd), base: 16)
             } else if entity.hasPrefix("&#") {
                 return decodeNumeric(entity.substringWithRange(entity.startIndex.advancedBy(2) ..< digitsEnd), base: 10)
             } else {
                 return characterEntities[entity]
             }
         }

         // ===== Method starts here =====

         var result = ""
         var position = startIndex

         // Find the next '&' and copy the characters preceding it to `result`:
         while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
             result.appendContentsOf(self[position ..< ampRange.startIndex])
             position = ampRange.startIndex

             // Find the next ';'. Without one there is nothing left to decode.
             guard let semiRange = self.rangeOfString(";", range: position ..< endIndex) else {
                 break
             }

             // A stray '&' (as in "a & b &lt; c") must not swallow the entity that
             // follows it: start the entity at the last '&' before the ';' and copy
             // everything in front of it verbatim.
             if let lastAmpRange = self.rangeOfString("&", options: .BackwardsSearch, range: position ..< semiRange.startIndex) {
                 result.appendContentsOf(self[position ..< lastAmpRange.startIndex])
                 position = lastAmpRange.startIndex
             }

             let entity = self[position ..< semiRange.endIndex]
             position = semiRange.endIndex

             if let decoded = decode(entity) {
                 // Replace by decoded character:
                 result.append(decoded)
             } else {
                 // Invalid entity, copy verbatim:
                 result.appendContentsOf(entity)
             }
         }

         // Copy remaining characters to `result`:
         result.appendContentsOf(self[position ..< endIndex])
         return result
     }
 }

例:

 // Sample input mixing markup, named entities and numeric (hex and decimal) entities.
 let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Price: 12 &#x20ac;. &#64; "
 // Only the entities are replaced; the <strong> tags stay (see the output below).
 let decoded = encoded.stringByDecodingHTMLEntities
 print(decoded)
 // <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @ 

Swift 3:

 // Mapping from XML/HTML character entity reference to character
 // From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
 private let characterEntities : [ String : Character ] = [
     // XML predefined entities:
     "&quot;"    : "\"",
     "&amp;"     : "&",
     "&apos;"    : "'",
     "&lt;"      : "<",
     "&gt;"      : ">",

     // HTML character entity references:
     "&nbsp;"    : "\u{00a0}",
     // ...
     "&diams;"   : "♦",
 ]

 extension String {

     /// Returns a new string made by replacing in the `String`
     /// all HTML character entity references with the corresponding
     /// character.
     ///
     /// Entities that cannot be decoded (unknown names, malformed or out-of-range
     /// numbers) are copied to the result unchanged.
     var stringByDecodingHTMLEntities : String {

         // ===== Utility functions =====

         // Convert the number in the string to the corresponding
         // Unicode character, e.g.
         //    decodeNumeric("64", 10)   --> "@"
         //    decodeNumeric("20ac", 16) --> "€"
         // Returns `nil` if `string` is not a number in the given base or
         // does not denote a valid Unicode scalar value.
         func decodeNumeric(_ string : String, base : Int) -> Character? {
             guard let code = UInt32(string, radix: base),
                 let uniScalar = UnicodeScalar(code) else { return nil }
             return Character(uniScalar)
         }

         // Decode the HTML character entity to the corresponding
         // Unicode character, return `nil` for invalid input.
         //     decode("&#64;")    --> "@"
         //     decode("&#x20ac;") --> "€"
         //     decode("&lt;")     --> "<"
         //     decode("&foo;")    --> nil
         func decode(_ entity : String) -> Character? {
             // Every entity passed in ends with ';'; drop it before parsing the digits.
             let digitsEnd = entity.index(before: entity.endIndex)

             if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                 let digitsStart = entity.index(entity.startIndex, offsetBy: 3)
                 return decodeNumeric(String(entity[digitsStart ..< digitsEnd]), base: 16)
             } else if entity.hasPrefix("&#") {
                 let digitsStart = entity.index(entity.startIndex, offsetBy: 2)
                 return decodeNumeric(String(entity[digitsStart ..< digitsEnd]), base: 10)
             } else {
                 return characterEntities[entity]
             }
         }

         // ===== Method starts here =====

         var result = ""
         var position = startIndex

         // Find the next '&' and copy the characters preceding it to `result`.
         // Slices are wrapped in `String(...)` so the code also compiles where
         // slicing a string yields a `Substring`.
         while let ampRange = range(of: "&", range: position ..< endIndex) {
             result.append(String(self[position ..< ampRange.lowerBound]))
             position = ampRange.lowerBound

             // Find the next ';'. Without one there is nothing left to decode.
             guard let semiRange = range(of: ";", range: position ..< endIndex) else {
                 break
             }

             // A stray '&' (as in "a & b &lt; c") must not swallow the entity that
             // follows it: start the entity at the last '&' before the ';' and copy
             // everything in front of it verbatim.
             if let lastAmpRange = range(of: "&", options: .backwards, range: position ..< semiRange.lowerBound) {
                 result.append(String(self[position ..< lastAmpRange.lowerBound]))
                 position = lastAmpRange.lowerBound
             }

             let entity = String(self[position ..< semiRange.upperBound])
             position = semiRange.upperBound

             if let decoded = decode(entity) {
                 // Replace by decoded character:
                 result.append(decoded)
             } else {
                 // Invalid entity, copy verbatim:
                 result.append(entity)
             }
         }

         // Copy remaining characters to `result`:
         result.append(String(self[position ..< endIndex]))
         return result
     }
 }

Swift 3版的@akashivskyy的扩展 ,

 extension String {

     /// Creates a string holding the plain text of `htmlEncodedString` parsed as a
     /// UTF-8 HTML document.
     ///
     /// If the text cannot be encoded as UTF-8, or the HTML import throws (the error
     /// is printed), the new string is `htmlEncodedString` itself.
     init(htmlEncodedString: String) {
         self.init()

         // Start from the fallback; it is replaced only when the import succeeds.
         self = htmlEncodedString

         guard let utf8Data = htmlEncodedString.data(using: .utf8) else {
             return
         }

         let readingOptions: [String : Any] = [
             NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
             NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
         ]

         do {
             self = try NSAttributedString(data: utf8Data, options: readingOptions, documentAttributes: nil).string
         } catch {
             print("Error: \(error)")
         }
     }
 }

Swift 2版本的@ akashivskyy的扩展,

  extension String {

      /// Creates a string holding the plain text of `htmlEncodedString` parsed as a
      /// UTF-8 HTML document.
      ///
      /// If the text cannot be encoded as UTF-8, or the HTML import throws (the error
      /// is printed), the new string is `htmlEncodedString` itself.
      init(htmlEncodedString: String) {
          if let encodedData = htmlEncodedString.dataUsingEncoding(NSUTF8StringEncoding) {
              let attributedOptions : [String: AnyObject] = [
                  NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
                  NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding
              ]

              do {
                  // The throwing initializer returns a non-optional value, so it is
                  // bound with a plain `let`; `if let` is only valid for optionals.
                  let attributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil)
                  self.init(attributedString.string)
              } catch {
                  print("error: \(error)")
                  // Returning actual string if there is an error
                  self.init(htmlEncodedString)
              }
          } else {
              // Returning actual string if there is an error
              self.init(htmlEncodedString)
          }
      }
  }
 extension String {

     /// Returns the plain text obtained by parsing the string as a UTF-8 HTML document.
     ///
     /// If the string cannot be encoded as UTF-8 or the HTML import fails, the
     /// original string is returned unchanged instead of crashing on a force-unwrap.
     func decodeEnt() -> String {
         let attributedOptions : [String: AnyObject] = [
             NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
             NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding
         ]

         if let encodedData = self.dataUsingEncoding(NSUTF8StringEncoding) {
             if let attributedString = NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil, error: nil) {
                 return attributedString.string
             }
         }

         // Encoding or HTML import failed: fall back to the undecoded text.
         return self
     }
 }

 let encodedString = "The Weeknd &#8216;King Of The Fall&#8217;"
 let foo = encodedString.decodeEnt() // The Weeknd ‘King Of The Fall’

Swift 4版本

 extension String {

     /// Creates a string holding the plain text of `htmlEncodedString` parsed as a
     /// UTF-8 HTML document.
     ///
     /// If the text cannot be encoded as UTF-8, or the HTML import throws (the error
     /// is printed), the new string is `htmlEncodedString` itself.
     init(htmlEncodedString: String) {
         self.init()

         guard let encodedData = htmlEncodedString.data(using: .utf8) else {
             self = htmlEncodedString
             return
         }

         // Use the reading-option keys directly instead of rebuilding them from the
         // raw values of `DocumentAttributeKey`.
         let attributedOptions: [NSAttributedString.DocumentReadingOptionKey : Any] = [
             .documentType: NSAttributedString.DocumentType.html,
             .characterEncoding: String.Encoding.utf8.rawValue
         ]

         do {
             let attributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil)
             self = attributedString.string
         } catch {
             print("Error: \(error)")
             self = htmlEncodedString
         }
     }
 }

我正在寻找一个纯 Swift 3.0 的实用程序,用来对 HTML 字符引用进行转义/反转义(例如用于 macOS 和 Linux 上的服务器端 Swift 应用程序),但没有找到任何全面的解决方案,所以我编写了自己的实现:https://github.com/IBM-Swift/swift-html-entities

HTMLEntities 包支持 HTML4 命名字符引用以及十六进制/十进制数字字符引用,并且能识别 W3 HTML5 规范中的特殊数字字符引用(例如 &#x80; 应该被转换为欧元符号(Unicode U+20AC),而不是 U+0080 对应的 Unicode 字符;某些范围的数字字符引用在反转义时应替换为替换字符 U+FFFD)。

用法示例:

 import HTMLEntities

 // encode example
 let html = "<script>alert(\"abc\")</script>"

 print(html.htmlEscape())
 // Prints "&lt;script&gt;alert(&quot;abc&quot;)&lt;/script&gt;"

 // decode example
 let htmlencoded = "&lt;script&gt;alert(&quot;abc&quot;)&lt;/script&gt;"

 print(htmlencoded.htmlUnescape())
 // Prints "<script>alert(\"abc\")</script>"

对于OP的例子:

 // &#8216; and &#8217; are the left and right single quotation marks (U+2018, U+2019).
 print("The Weeknd &#8216;King Of The Fall&#8217; [Video Premiere] | @TheWeeknd | #SoPhi ".htmlUnescape())
 // prints "The Weeknd ‘King Of The Fall’ [Video Premiere] | @TheWeeknd | #SoPhi "

编辑:HTMLEntities 从 2.0.0 版本开始支持 HTML5 命名字符引用,并且实现了符合规范的解析。

@yishus 答案的计算属性(computed var)版本:

 public extension String {

     /// The plain text obtained by parsing this string as a UTF-8 HTML document.
     ///
     /// Evaluates to the string itself when it cannot be encoded as UTF-8 or when
     /// the HTML import throws (the error is printed).
     var htmlDecoded: String {
         guard let utf8Data = data(using: .utf8) else {
             return self
         }

         let readingOptions: [String : Any] = [
             NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
             NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
         ]

         do {
             return try NSAttributedString(data: utf8Data, options: readingOptions, documentAttributes: nil).string
         } catch {
             print("Error: \(error)")
             return self
         }
     }
 }

这是我的做法。你可以加入 Michael Waterfall 在 https://gist.github.com/mwaterfall/25b4a6a06dc3309d9555 中提到的实体字典。

 extension String {

     /// Returns a copy of the string with the five predefined XML entities
     /// (`&quot;`, `&apos;`, `&lt;`, `&gt;`, `&amp;`) replaced by their characters.
     ///
     /// Other named or numeric entities are left untouched.
     func htmlDecoded() -> String {
         guard !isEmpty else { return self }

         // The replacements run in this order. "&amp;" must come last: decoding it
         // first would turn "&amp;lt;" into "&lt;" and then into "<" (double decoding).
         // A dictionary cannot be used here because its iteration order is unspecified.
         let entities: [(name: String, value: String)] = [
             ("&quot;", "\""),
             ("&apos;", "'"),
             ("&lt;", "<"),
             ("&gt;", ">"),
             ("&amp;", "&"),
         ]

         var newStr = self
         for (name, value) in entities {
             newStr = newStr.replacingOccurrences(of: name, with: value)
         }
         return newStr
     }
 }

使用的例子:

 let encoded = "this is so &quot;good&quot;"
 // Each &quot; is replaced by a double quote.
 let decoded = encoded.htmlDecoded() // "this is so "good""

或者

 // Same call applied directly to the string literal.
 let encoded = "this is so &quot;good&quot;".htmlDecoded() // "this is so "good""

更新了Swift 3上的答案

  extension String {

      /// Creates a string from the plain text of `htmlEncodedString` parsed as a
      /// UTF-8 HTML document.
      ///
      /// - Parameter htmlEncodedString: Text that may contain HTML entities or tags.
      /// - Returns: `nil` when the text cannot be encoded as UTF-8 or the HTML import throws.
      init?(htmlEncodedString: String) {
          // A failed encoding makes the initializer fail instead of crashing.
          guard let encodedData = htmlEncodedString.data(using: String.Encoding.utf8) else {
              return nil
          }

          // State the encoding explicitly so the importer reads the bytes as UTF-8,
          // matching how `encodedData` was produced.
          let attributedOptions: [String: Any] = [
              NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
              NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
          ]

          guard let attributedString = try? NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil) else {
              return nil
          }

          self.init(attributedString.string)
      }
  }

Swift 3.0 版本,带有实际的字体大小转换

通常,如果直接将 HTML 转换为属性字符串(attributed string),字体大小会变大。您可以尝试将 HTML 字符串转换为属性字符串,然后再转换回来,以查看区别。

相反,下面的实现通过对所有字体应用 0.75 的比例进行实际的大小转换,确保字体大小不变:

 extension String {

     /// Parses the string as HTML and returns the attributed result with every font
     /// scaled to 0.75 of the point size produced by the import.
     ///
     /// - Returns: `nil` when the string cannot be encoded as UTF-16 without loss or
     ///   the HTML import throws.
     func htmlAttributedString() -> NSAttributedString? {
         guard
             let utf16Data = data(using: String.Encoding.utf16, allowLossyConversion: false),
             let styled = try? NSMutableAttributedString(
                 data: utf16Data,
                 options: [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType],
                 documentAttributes: nil)
         else {
             return nil
         }

         let wholeRange = NSMakeRange(0, styled.length)

         styled.beginEditing()
         styled.enumerateAttribute(NSFontAttributeName, in: wholeRange, options: []) { attribute, subrange, _ in
             // Runs without a UIFont value are left as they are.
             guard let importedFont = attribute as? UIFont else { return }
             let scaledFont = importedFont.withSize(importedFont.pointSize * 0.75)
             styled.addAttribute(NSFontAttributeName, value: scaledFont, range: subrange)
         }
         styled.endEditing()

         return styled
     }
 }

Swift 4

 extension String {

     /// The plain text obtained by parsing this string's UTF-8 bytes as an HTML
     /// document, or `nil` when the HTML import throws.
     var replacingHTMLEntities: String? {
         let readingOptions: [NSAttributedString.DocumentReadingOptionKey: Any] = [
             .documentType: NSAttributedString.DocumentType.html,
             .characterEncoding: String.Encoding.utf8.rawValue
         ]

         let parsed = try? NSAttributedString(data: Data(utf8), options: readingOptions, documentAttributes: nil)
         return parsed?.string
     }
 }

简单的用法

 let source = "string"
 // `replacingHTMLEntities` is nil when the HTML import fails; fall back to the
 // undecoded text rather than force-unwrapping, which would crash in that case.
 let clean = source.replacingHTMLEntities ?? source

SWIFT 4

 extension String {

     /// Replaces the string in place with the plain text obtained by parsing it as
     /// a UTF-8 HTML document.
     ///
     /// Despite its name, this method decodes rather than encodes. The string is
     /// left unchanged when it cannot be encoded as UTF-8 or when the HTML import
     /// throws (the error is printed).
     mutating func toHtmlEncodedString() {
         guard let encodedData = self.data(using: .utf8) else {
             return
         }

         // Use the reading-option keys directly instead of rebuilding them from the
         // raw values of `DocumentAttributeKey`.
         let attributedOptions: [NSAttributedString.DocumentReadingOptionKey : Any] = [
             .documentType: NSAttributedString.DocumentType.html,
             .characterEncoding: String.Encoding.utf8.rawValue
         ]

         do {
             let attributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil)
             self = attributedString.string
         } catch {
             print("Error: \(error)")
         }
     }
 }

// Placeholder: replace the empty NSData with the NSData value you received.
let dataRes: NSData = NSData()

// Interprets the bytes as UTF-8 text.
// NOTE(review): this looks like it only converts bytes to a string and would not
// decode HTML entities such as &#8216; — confirm before relying on it for that.
var resString = NSString(data: dataRes, encoding: NSUTF8StringEncoding)