//
// HTMLToMarkdownConverter.swift
// V2er
//
// Created by RichView on 2025/1/19.
//
7+
8+ import Foundation
9+ import SwiftSoup
10+
/// Converts HTML content to Markdown format.
///
/// The input is parsed with SwiftSoup and walked recursively; every supported
/// tag is mapped to its Markdown equivalent. Tags without a mapping are
/// reported through `RenderError.handleUnsupportedTag` and, if that call
/// returns, their children are converted as if the tag were a plain container.
public class HTMLToMarkdownConverter {

    /// Passed through to `RenderError.handleUnsupportedTag` for every tag that
    /// has no Markdown mapping.
    private let crashOnUnsupportedTags: Bool

    /// Characters that receive a leading backslash when they occur in plain text.
    private static let markdownSpecials: Set<Character> = [
        "\\", "`", "*", "_", "[", "]", "(", ")", "#", "+", "-", ".", "!"
    ]

    /// Initialize converter.
    ///
    /// - Parameter crashOnUnsupportedTags: Forwarded unchanged to
    ///   `RenderError.handleUnsupportedTag` when an unmapped tag is encountered.
    public init(crashOnUnsupportedTags: Bool = true) {
        self.crashOnUnsupportedTags = crashOnUnsupportedTags
    }

    /// Convert an HTML string to Markdown.
    ///
    /// - Parameter html: HTML fragment or document.
    /// - Returns: Markdown text with runs of blank lines collapsed and
    ///   surrounding whitespace trimmed.
    /// - Throws: Any error raised by SwiftSoup while parsing or reading the
    ///   tree, or by `RenderError.handleUnsupportedTag`.
    public func convert(_ html: String) throws -> String {
        let preprocessedHTML = preprocessV2EXContent(html)

        let doc = try SwiftSoup.parse(preprocessedHTML)
        // Fall back to the document itself when the parser produced no <body>.
        let body = doc.body() ?? doc

        let markdown = try convertElement(body)
        return cleanupMarkdown(markdown)
    }

    /// Rewrite protocol-relative and root-relative `href`/`src` attribute
    /// values into absolute `https` URLs before parsing.
    ///
    /// This is a plain textual replacement on the raw HTML, so only
    /// double-quoted attributes written exactly as `href="…"` / `src="…"` are
    /// affected.
    private func preprocessV2EXContent(_ html: String) -> String {
        // Order matters: the "//" forms must be rewritten first, otherwise the
        // single-slash rules would match them and produce a wrong host.
        let rewrites: [(pattern: String, replacement: String)] = [
            ("href=\"//", "href=\"https://"),
            ("src=\"//", "src=\"https://"),
            ("href=\"/", "href=\"https://www.v2ex.com/"),
            ("src=\"/", "src=\"https://www.v2ex.com/")
        ]

        var processed = html
        for rewrite in rewrites {
            processed = processed.replacingOccurrences(of: rewrite.pattern, with: rewrite.replacement)
        }
        return processed
    }

    /// Convert the children of `element` to Markdown, recursing into nested
    /// elements.
    ///
    /// - Parameter element: The node whose child nodes are converted; the
    ///   element's own tag is not rendered here.
    /// - Returns: The concatenated Markdown of all child nodes.
    /// - Throws: SwiftSoup errors, or whatever
    ///   `RenderError.handleUnsupportedTag` throws for an unmapped tag.
    private func convertElement(_ element: Element) throws -> String {
        var result = ""

        for node in element.getChildNodes() {
            if let textNode = node as? TextNode {
                // Plain text: neutralise characters Markdown would interpret.
                result += escapeMarkdown(textNode.text())
                continue
            }
            // Comments and other non-element nodes produce no output.
            guard let childElement = node as? Element else { continue }

            let tagName = childElement.tagName().lowercased()

            switch tagName {
            // Basic text formatting
            case "p":
                let content = try convertElement(childElement)
                result += "\n\n\(content)\n\n"

            case "br":
                result += "\n"

            case "strong", "b":
                let content = try convertElement(childElement)
                result += wrapInline(content, with: "**")

            case "em", "i":
                let content = try convertElement(childElement)
                result += wrapInline(content, with: "*")

            case "a":
                let text = try convertElement(childElement)
                // SwiftSoup reports a missing attribute as an empty string, so
                // emptiness (not nil) is what signals "no destination".
                let href = attribute("href", of: childElement)
                if href.isEmpty {
                    result += text
                } else {
                    result += "[\(text)](\(href))"
                }

            case "code":
                // Inline code. <code> nested in <pre> never reaches this case
                // because <pre> is rendered from its text() without recursion.
                let content = try childElement.text()
                if !content.isEmpty {
                    // The delimiter must be longer than any backtick run inside.
                    let fence = String(repeating: "`", count: longestBacktickRun(in: content) + 1)
                    // A space keeps a leading/trailing backtick from merging with the delimiter.
                    let padding = (content.hasPrefix("`") || content.hasSuffix("`")) ? " " : ""
                    result += "\(fence)\(padding)\(content)\(padding)\(fence)"
                }

            case "pre":
                // Fenced code block
                let content = try childElement.text()
                let language = codeLanguage(of: childElement) ?? ""
                let fence = String(repeating: "`", count: max(3, longestBacktickRun(in: content) + 1))
                result += "\n\(fence)\(language)\n\(content)\n\(fence)\n"

            case "blockquote":
                let content = try convertElement(childElement)
                result += quoteLines(of: content)

            case "ul":
                result += try convertList(childElement, ordered: false)

            case "ol":
                result += try convertList(childElement, ordered: true)

            case "li":
                // Normally consumed by convertList; an <li> outside a list is
                // rendered as its bare content.
                result += try convertElement(childElement)

            case "h1", "h2", "h3", "h4", "h5", "h6":
                // The digit after "h" is the heading level.
                let level = Int(tagName.dropFirst()) ?? 1
                let content = try convertElement(childElement)
                result += "\n\(String(repeating: "#", count: level)) \(content)\n"

            case "img":
                let src = attribute("src", of: childElement)
                // An image without a source has nothing to point at; emit nothing.
                if !src.isEmpty {
                    let alt = escapeMarkdown(attribute("alt", of: childElement))
                    result += "![\(alt)](\(src))"
                }

            case "hr":
                result += "\n---\n"

            // Container elements - just process children
            case "div", "span", "body", "html":
                result += try convertElement(childElement)

            default:
                // Unsupported tag: report it with a short excerpt for context.
                try RenderError.handleUnsupportedTag(
                    tagName,
                    context: String(childElement.outerHtml().prefix(100)),
                    crashOnUnsupportedTags: crashOnUnsupportedTags
                )

                // Reached only if the handler returned: degrade to the tag's content.
                result += try convertElement(childElement)
            }
        }

        return result
    }

    /// Convert a `<ul>`/`<ol>` element to a Markdown list.
    ///
    /// Only direct `<li>` children are treated as items of this list. Nested
    /// lists are rendered as part of their parent item's content and indented
    /// beneath it, so their items are neither duplicated nor counted in this
    /// list's numbering.
    ///
    /// - Parameters:
    ///   - element: The list element.
    ///   - ordered: `true` for `1.`-style numbering, `false` for `-` bullets.
    /// - Returns: The list surrounded by newlines.
    private func convertList(_ element: Element, ordered: Bool) throws -> String {
        let items = element.children().array().filter { $0.tagName().lowercased() == "li" }
        var result = "\n"

        for (index, item) in items.enumerated() {
            let marker = ordered ? "\(index + 1). " : "- "
            let indent = String(repeating: " ", count: marker.count)
            let content = try convertElement(item).trimmingCharacters(in: .whitespacesAndNewlines)

            // Continuation lines are indented to the marker width so they stay
            // inside the item; blank lines are left empty.
            var isFirstLine = true
            for line in content.split(separator: "\n", omittingEmptySubsequences: false) {
                if isFirstLine {
                    result += "\(marker)\(line)\n"
                    isFirstLine = false
                } else if line.isEmpty {
                    result += "\n"
                } else {
                    result += "\(indent)\(line)\n"
                }
            }
        }

        result += "\n"
        return result
    }

    /// Prefix every line of `content` with `> `.
    ///
    /// Surrounding whitespace is dropped and each run of blank lines becomes a
    /// single `>` line, so separate paragraphs stay separate inside the quote.
    private func quoteLines(of content: String) -> String {
        let trimmed = content.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return "\n" }

        // Leading newline: a block quote must start on its own line.
        var quoted = "\n"
        var previousWasBlank = false
        for line in trimmed.split(separator: "\n", omittingEmptySubsequences: false) {
            let isBlank = line.allSatisfy { $0.isWhitespace }
            if !isBlank {
                quoted += "> \(line)\n"
            } else if !previousWasBlank {
                quoted += ">\n"
            }
            previousWasBlank = isBlank
        }
        return quoted + "\n"
    }

    /// Wrap `content` in an emphasis `delimiter`, keeping leading and trailing
    /// whitespace outside the markers. Content that is empty or only
    /// whitespace is returned unchanged.
    private func wrapInline(_ content: String, with delimiter: String) -> String {
        let leading = String(content.prefix(while: { $0.isWhitespace }))
        guard leading.count < content.count else { return content }

        let trailing = String(content.reversed().prefix(while: { $0.isWhitespace }).reversed())
        let core = content.dropFirst(leading.count).dropLast(trailing.count)
        return "\(leading)\(delimiter)\(core)\(delimiter)\(trailing)"
    }

    /// Read an attribute as a trimmed string; a missing attribute or a failed
    /// read yields `""`.
    private func attribute(_ name: String, of element: Element) -> String {
        let raw = (try? element.attr(name)) ?? ""
        return raw.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Find a `language-xxx` class on `pre` itself or on one of its direct
    /// `<code>` children and return the `xxx` part, or `nil` if there is none.
    private func codeLanguage(of pre: Element) -> String? {
        let prefix = "language-"
        let nestedCode = pre.children().array().filter { $0.tagName().lowercased() == "code" }

        for candidate in [pre] + nestedCode {
            let classNames = attribute("class", of: candidate).split(separator: " ")
            for className in classNames where className.hasPrefix(prefix) && className.count > prefix.count {
                return String(className.dropFirst(prefix.count))
            }
        }
        return nil
    }

    /// Length of the longest run of consecutive backticks in `text`.
    private func longestBacktickRun(in text: String) -> Int {
        var longest = 0
        var current = 0
        for character in text {
            current = character == "`" ? current + 1 : 0
            longest = max(longest, current)
        }
        return longest
    }

    /// Backslash-escape characters that Markdown would otherwise interpret.
    ///
    /// Only used for text nodes and image alt text; code content is taken from
    /// `text()` and never passes through here, so no code-context check is
    /// needed. Done in a single pass so inserted backslashes are not escaped
    /// again.
    private func escapeMarkdown(_ text: String) -> String {
        var escaped = ""
        escaped.reserveCapacity(text.count)

        for character in text {
            if HTMLToMarkdownConverter.markdownSpecials.contains(character) {
                escaped.append("\\")
            }
            escaped.append(character)
        }
        return escaped
    }

    /// Collapse runs of three or more newlines into a single blank line and
    /// trim leading/trailing whitespace.
    ///
    /// NOTE(review): the replacement is applied to the whole string, so
    /// consecutive blank lines inside fenced code blocks are collapsed as well.
    private func cleanupMarkdown(_ markdown: String) -> String {
        let collapsed = markdown.replacingOccurrences(
            of: #"\n{3,}"#,
            with: "\n\n",
            options: .regularExpression
        )
        return collapsed.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
0 commit comments