fw4spl
Encoding.cpp
1 /* ***** BEGIN LICENSE BLOCK *****
2  * FW4SPL - Copyright (C) IRCAD, 2009-2017.
3  * Distributed under the terms of the GNU Lesser General Public License (LGPL) as
4  * published by the Free Software Foundation.
5  * ****** END LICENSE BLOCK ****** */
6 
7 #include "fwGdcmIO/helper/Encoding.hpp"
8 
9 #include <fwCore/spyLog.hpp>
10 
11 #include <fwLog/Logger.hpp>
12 
13 #include <boost/algorithm/string/classification.hpp>
14 #include <boost/algorithm/string/split.hpp>
15 #include <boost/assign/list_of.hpp>
16 #include <boost/foreach.hpp>
17 #include <boost/locale/encoding.hpp>
18 
19 namespace fwGdcmIO
20 {
21 namespace helper
22 {
23 
24 const Encoding::DefinedTermToCharsetMapType Encoding::s_DEFINED_TERM_TO_CHARSET = ::boost::assign::map_list_of
25  // ASCII
26  ("ISO_IR 6", "")
27  ("ISO 2022 IR 6", "")
28 
29  // Latin alphabet No. 1
30  ("ISO_IR 100", "ISO-8859-1")
31  ("ISO 2022 IR 100", "ISO-8859-1")
32 
33  // Latin alphabet No. 2
34  ("ISO_IR 101", "ISO-8859-2")
35  ("ISO 2022 IR 101", "ISO-8859-2")
36 
37  // Latin alphabet No. 3
38  ("ISO_IR 109", "ISO-8859-3")
39  ("ISO 2022 IR 109", "ISO-8859-3")
40 
41  // Latin alphabet No. 4
42  ("ISO_IR 110", "ISO-8859-4")
43  ("ISO 2022 IR 110", "ISO-8859-4")
44 
45  // Cyrillic
46  ("ISO_IR 144", "ISO-8859-5")
47  ("ISO 2022 IR 144", "ISO-8859-5")
48 
49  // Arabic
50  ("ISO_IR 127", "ISO-8859-6")
51  ("ISO 2022 IR 127", "ISO-8859-6")
52 
53  // Greek
54  ("ISO_IR 126", "ISO-8859-7")
55  ("ISO 2022 IR 126", "ISO-8859-7")
56 
57  // Hebrew
58  ("ISO_IR 138", "ISO-8859-8")
59  ("ISO 2022 IR 138", "ISO-8859-8")
60 
61  // Latin alphabet No. 5
62  ("ISO_IR 148", "ISO-8859-9")
63  ("ISO 2022 IR 148", "ISO-8859-9")
64 
65  // Japanese (may require "ISO-IR-13")
66  ("ISO_IR 13", "JIS_X0201")
67  ("ISO 2022 IR 13", "JIS_X0201")
68 
69  // Thai
70  ("ISO_IR 166", "ISO-IR-166")
71  ("ISO 2022 IR 166", "ISO-IR-166")
72 
73  // Japanese
74  ("ISO 2022 IR 87", "ISO-IR-87")
75  ("ISO 2022 IR 159", "ISO-IR-159")
76 
77  // Korean
78  // - is this mapping really correct?
79  ("ISO 2022 IR 149", "EUC-KR")
80 
81  // Simplified Chinese
82  // - is this mapping really correct?
83  ("ISO 2022 IR 58", "GB2312")
84 
85  // Unicode in UTF-8 (multi-byte)
86  ("ISO_IR 192", "UTF-8")
87 
88  // Chinese (multi-byte)
89  ("GB18030", "GB18030")
90 
91  // Simplified Chinese (multi-byte)
92  ("GBK", "GBK")
93 ;
94 
95 const Encoding::EscapeSequenceToCharsetMapType Encoding::s_ESCAPE_SEQUENCE_TO_CHARSET = ::boost::assign::map_list_of
96  (std::make_pair(0x28,
97  0x42),
98  std::make_pair(
99  "ISO 2022 IR 6", "")) // ASCII
100  (std::make_pair(0x2d, 0x41),
101  std::make_pair(
102  "ISO 2022 IR 100",
103  "ISO-8859-1")) // Latin
104  // alphabet
105  // No. 1
106  (std::make_pair(0x2d, 0x42),
107  std::make_pair(
108  "ISO 2022 IR 101",
109  "ISO-8859-2")) // Latin
110  // alphabet
111  // No. 2
112  (std::make_pair(0x2d, 0x43),
113  std::make_pair(
114  "ISO 2022 IR 109",
115  "ISO-8859-3")) // Latin
116  // alphabet
117  // No. 3
118  (std::make_pair(0x2d, 0x44),
119  std::make_pair(
120  "ISO 2022 IR 110",
121  "ISO-8859-4")) // Latin
122  // alphabet
123  // No. 4
124  (std::make_pair(0x2d, 0x4c),
125  std::make_pair(
126  "ISO 2022 IR 144",
127  "ISO-8859-5")) // Cyrillic
128  (std::make_pair(0x2d, 0x47),
129  std::make_pair(
130  "ISO 2022 IR 127",
131  "ISO-8859-6")) // Arabic
132  (std::make_pair(0x2d, 0x46),
133  std::make_pair(
134  "ISO 2022 IR 126",
135  "ISO-8859-7")) // Greek
136  (std::make_pair(0x2d, 0x48),
137  std::make_pair(
138  "ISO 2022 IR 138",
139  "ISO-8859-8")) // Hebrew
140  (std::make_pair(0x2d, 0x4d),
141  std::make_pair(
142  "ISO 2022 IR 148",
143  "ISO-8859-9")) // Latin
144  // alphabet
145  // No. 5
146  (std::make_pair(0x29, 0x49),
147  std::make_pair(
148  "ISO 2022 IR 13",
149  "JIS_X0201")) // Japanese
150  // (may
151  // require
152  // "ISO-IR-13")
153  (std::make_pair(0x28, 0x4a),
154  std::make_pair(
155  "ISO 2022 IR 13",
156  "ISO-IR-14")) // Japanese
157  (std::make_pair(0x2d, 0x54),
158  std::make_pair(
159  "ISO 2022 IR 166",
160  "ISO-IR-166")) // Thai
161  (std::make_pair(0x24, 0x42),
162  std::make_pair(
163  "ISO 2022 IR 87",
164  "ISO-IR-87")) // Japanese
165  // (multi-byte)
166 ;
167 
168 //------------------------------------------------------------------------------
169 
170 std::string Encoding::convertString(const std::string& source,
171  const std::string& definedCharsetTerm,
172  const ::fwLog::Logger::sptr& logger)
173 throw(::fwCore::Exception, ::boost::locale::conv::invalid_charset_error)
174 {
175  if(source.empty())
176  {
177  return "";
178  }
179 
180  // Retrieve DICOM Specific Character Set List
181  std::vector<std::string> definedTermList;
182  ::boost::split(definedTermList, definedCharsetTerm, ::boost::is_any_of("\\"));
183 
184  // Only one charset without code extension techniques is used
185  if(definedCharsetTerm.empty() || definedTermList.size() == 1)
186  {
187  return convertStringWithoutCodeExtensions(source, definedCharsetTerm, logger);
188  }
189  // Several charsets with code extension techniques are used
190  else
191  {
192  // If the attribute Specific Character Set (0008,0005) has more than one value
193  // and value 1 is empty, it is assumed that value 1 is ISO 2022 IR 6.
194  if(definedTermList[0].empty())
195  {
196  definedTermList[0] = "ISO 2022 IR 6";
197  }
198 
199  // Check for characters ESC delimiter
200  std::vector<std::string> sequenceList;
201  ::boost::split(sequenceList, source, ::boost::is_any_of("\033"));
202 
203  std::string result;
204 
205  // Add the first part
206  if(source[0] != '\033')
207  {
208  result += convertStringWithoutCodeExtensions(sequenceList[0], definedTermList[0], logger);
209  }
210  else
211  {
212  result += Encoding::convertSequenceWithCodeExtensions(sequenceList[0], definedTermList, logger);
213  }
214 
215  // Convert remaining sequences according to specific charsets
216  std::vector<std::string>::iterator it = ++sequenceList.begin();
217  for(; it != sequenceList.end(); ++it)
218  {
219  result += convertSequenceWithCodeExtensions(*it, definedTermList, logger);
220  }
221 
222  return result;
223  }
224 }
225 
226 //------------------------------------------------------------------------------
227 
228 std::string Encoding::convertStringWithoutCodeExtensions(const std::string& source,
229  const std::string& definedTerm,
230  const ::fwLog::Logger::sptr& logger)
231 {
232  std::string charset;
233 
234  if (definedTerm.empty()) // assuming ASCII (according to DICOM PS 3.5)
235  {
236  charset = "";
237  }
238  else
239  {
240  SLM_WARN_IF("'ISO_IR 6' is not a defined term in DICOM, will be treated as an empty value (ASCII)",
241  definedTerm == "ISO_IR 6");
242 
243  // Check that the defined term is known
244  if(s_DEFINED_TERM_TO_CHARSET.find(definedTerm) != s_DEFINED_TERM_TO_CHARSET.end())
245  {
246  charset = s_DEFINED_TERM_TO_CHARSET.at(definedTerm);
247  }
248  else
249  {
250  const std::string msg = "'"+definedTerm+"' is not a defined term in DICOM, "
251  "will be treated as an empty value (ASCII)";
252 
253  SLM_WARN_IF(msg, !logger);
254  if(logger)
255  {
256  logger->warning(msg);
257  }
258 
259  charset = "";
260  }
261  }
262 
263  // Empty value treated as ASCII
264  if(charset.empty())
265  {
266  return source;
267  }
268  else
269  {
270  return ::boost::locale::conv::to_utf<char>(source, charset);
271  }
272 
273 }
274 
275 //------------------------------------------------------------------------------
276 
277 void checkDefinedTermDeclaration(const std::string& definedTerm,
278  const std::vector<std::string>& definedTermList,
279  const ::fwLog::Logger::sptr& logger)
280 {
281  if(std::find(definedTermList.begin(), definedTermList.end(), definedTerm) == definedTermList.end())
282  {
283  const std::string msg = "Escape sequence refers to character set '" + definedTerm
284  + "' that was not declared in SpecificCharacterSet (0008,0005).";
285 
286  SLM_WARN_IF(msg, !logger);
287  if(logger)
288  {
289  logger->warning(msg);
290  }
291  }
292 }
293 
294 //------------------------------------------------------------------------------
295 
296 std::string Encoding::convertSequenceWithCodeExtensions(const std::string& sequence,
297  const std::vector<std::string>& definedTermList,
298  const ::fwLog::Logger::sptr& logger)
299 throw(::fwCore::Exception, ::boost::locale::conv::invalid_charset_error)
300 {
301  // We need at least two more characters to determine the new character set
302  FW_RAISE_IF("Cannot convert character set: Incomplete escape sequence.", sequence.size() < 2);
303 
304  const char c1 = sequence[0];
305  const char c2 = sequence[1];
306 
307  unsigned short escapeSize = 2;
308 
309  EscapeSequenceType escapeSequence = std::make_pair(c1, c2);
310  DefinedTermAndCharsetPairType definedTermAndCharset = std::make_pair("", "");
311 
312  if(s_ESCAPE_SEQUENCE_TO_CHARSET.find(escapeSequence) != s_ESCAPE_SEQUENCE_TO_CHARSET.end())
313  {
314  definedTermAndCharset = s_ESCAPE_SEQUENCE_TO_CHARSET.at(escapeSequence);
315  }
316  else if ((c1 == 0x24) && (c2 == 0x28)) // Japanese (multi-byte)
317  {
318  // Do we still have another character in the string?
319  if(sequence.size() >= 3)
320  {
321  escapeSize = 3;
322  if (sequence[2] == 0x44)
323  {
324  definedTermAndCharset = std::make_pair("ISO 2022 IR 159", "ISO-IR-159");
325  }
326  }
327  }
328  else if ((c1 == 0x24) && (c2 == 0x29))
329  {
330  // Do we still have another character in the string?
331  if(sequence.size() >= 3)
332  {
333  escapeSize = 3;
334  if (sequence[2] == 0x43) // Korean (multi-byte)
335  {
336  // - is this mapping really correct?
337  definedTermAndCharset = std::make_pair("ISO 2022 IR 149", "EUC-KR");
338  }
339  else if (sequence[2] == 0x41) // Simplified Chinese (multi-byte)
340  {
341  // - is this mapping really correct?
342  definedTermAndCharset = std::make_pair("ISO 2022 IR 58", "GB2312");
343  }
344  }
345  }
346 
347  // Check that a definedTerm has been found
348  FW_RAISE_IF("Unable to retrieve character set from escape sequence.", definedTermAndCharset.first.empty());
349 
350  // Check that the defined term has been declared in SpecificCharacterSet (0008,0005)
351  checkDefinedTermDeclaration(definedTermAndCharset.first, definedTermList, logger);
352 
353  // Empty value treated as ASCII
354  if(definedTermAndCharset.second.empty())
355  {
356  return sequence.substr(escapeSize);
357  }
358  else
359  {
360  return ::boost::locale::conv::to_utf<char>(sequence.substr(escapeSize), definedTermAndCharset.second);
361  }
362 }
363 
364 //------------------------------------------------------------------------------
365 
366 } //namespace helper
367 } //namespace fwGdcmIO
368 
The namespace fwGdcmIO contains reader, writer and helper for dicom data.
This file defines SpyLog macros. These macros are used to log messages to a file or to the console du...
static FWGDCMIO_API std::string convertString(const std::string &source, const std::string &definedCharsetTerm, const std::shared_ptr< ::fwLog::Logger > &logger=nullptr)
Convert a DICOM string from the specified charset to utf-8.
Definition: Encoding.cpp:170
#define SLM_WARN_IF(message, cond)
Definition: spyLog.hpp:265