fw4spl
build_docset.py
#!/usr/bin/env python3
2 """
3 Generate a dash docset from a doxygen documentation.
4 """

import json
import os
from pathlib import Path
import re
import shutil
import sqlite3
import sys

import bs4

CFG = dict()
REPO_NAMES = dict()

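# Doxygen escapes '::' as '_1_1' when generating file names, e.g. the class
# 'fwData::Object' is documented in 'classfwData_1_1Object.html'. The *_FILE_RE
# regexes below rely on that scheme to recognize class, struct and namespace pages.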
# Global regexes we don't want to recompile every single time we parse a file
CLASS_FILE_RE = re.compile(r'class([a-zA-Z_][a-zA-Z0-9_]*)_1_1([a-zA-Z_][a-zA-Z0-9_]*)\.html')
CLASS_RE = re.compile('fw4spl: (.+) Class Reference')
STRUCT_FILE_RE = re.compile(r'struct([a-zA-Z_][a-zA-Z0-9_]*)_1_1([a-zA-Z_][a-zA-Z0-9_]*)\.html')
STRUCT_RE = re.compile('fw4spl: (.+) Struct Reference')
NAMESPACE_FILE_RE = re.compile(r'namespace.+\.html')
NAMESPACE_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_:]*) Namespace Reference')
SRV_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_]*::(?:[a-zA-Z_][a-zA-Z0-9_]*::)*(S[A-Z0-9][a-zA-Z0-9_]*)) Class Reference')
BAD__SRV_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_]*::(?:[a-zA-Z_][a-zA-Z0-9_]*::)*([A-Z0-9][a-zA-Z0-9_]*)) Class Reference')
OBJ_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_]*::(?:[a-zA-Z_][a-zA-Z0-9_]*::)*([A-Z0-9][a-zA-Z0-9_]*)) Class Reference')
IFACE_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_]*::(?:[a-zA-Z_][a-zA-Z0-9_]*::)*(I[A-Z0-9][a-zA-Z0-9_]*|IService)) Class Reference')
EXCEPT_RE = re.compile('fw4spl: ([a-zA-Z_][a-zA-Z0-9_]*::(?:[a-zA-Z_][a-zA-Z0-9_]*::)*([A-Z0-9][a-zA-Z0-9_]*)) Struct Reference')

# Regexes of the files to skip
FILE_SKIP_RE = [
    re.compile('pages.html'),
    re.compile(r'dir_.+\.html'),
    re.compile('.+_source.html')
]

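# A dash docset is a directory tree of the form:
#   fw4spl.docset/Contents/Resources/docSet.dsidx   (SQLite search index)
#   fw4spl.docset/Contents/Resources/Documents/     (the HTML pages)
# bootstrap_docset() creates the skeleton and the index; copy_files() fills in
# the Documents directory at the end of the run.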
def bootstrap_docset():
    """
    Create the skeleton for the docset, i.e. the directory structure along with the SQLite database. Return the SQLite
    database connection.
    """
    # Create the directory structure.
    Path('./fw4spl.docset/Contents/Resources').mkdir(parents=True, exist_ok=True)

    # Then, create the SQLite database
    db = Path('./fw4spl.docset/Contents/Resources/docSet.dsidx')
    if db.exists():
        os.remove(str(db))
    conn_ = sqlite3.connect(str(db))
    cur = conn_.cursor()
    cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')
    conn_.commit()
    return conn_

def gather_sources():
    """
    Return a list containing the paths to all interesting HTML files contained at the root of the Doxygen html
    directory. We're not interested in what's in the subdirectories.
    """
    # os.walk yields the top-level directory first; keep only the HTML files found there.
    _, _, dir_files = next(os.walk('./html/'))
    return [f for f in dir_files if f.endswith('.html')]

def parse_related_pages():
    """
    Parse the 'pages.html' doxygen file and generate the list of related pages.
    """
    pages = list()
    html = open(os.path.join('./html', 'pages.html'), encoding="utf8").read()
    soup = bs4.BeautifulSoup(html, "html.parser")
    table = soup.find("table", class_="directory")
    for row in table.find_all("tr"):
        page_name = row.td.a.string
        page_link = row.td.a.get('href')
        pages.append((page_name, "Guide", page_link))
    return pages

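# Doxygen pages typically end with the full path of the source file they were
# generated from; file_repo() matches that path against the configured
# repository paths to recover a short repository name for the docset index.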
# TODO: strip the leading repository path from the HTML file to get rid of the ugly full paths ?
def file_repo(f_soup):
    """
    Return the name of the repository that a particular documentation file was generated from, or None if not possible.
    """
    lists = f_soup.find_all('ul')
    if lists:
        file_path = lists[-1].li.get_text()
        candidates = [repo for repo in CFG['repositories'] if file_path.startswith(repo)]
        if candidates:
            # Prefer the longest match, i.e. the most specific repository path.
            res = max(candidates, key=len)
            return REPO_NAMES[res]
    return None

def parse_file(f_):
    """
    Parse an HTML file and return a (potentially empty) list of 3-tuples to add to the SQLite database.
    """
    # Doxygen names the documentation files in a friendly manner, which means we can guess what is inside from the file
    # name, and ignore files that we know we don't care about. This script currently looks for files containing classes
    # or structs.
    new_entries = list()
    # Some files are of no interest to us and can be skipped
    if any(map(lambda regexp: regexp.match(f_), FILE_SKIP_RE)):
        return new_entries
    try:
        html = open(os.path.join('./html', f_), encoding="utf8").read()
        soup = bs4.BeautifulSoup(html, "html.parser")
        inherits_iservice = soup.find(class_='inherit_header pub_methods_classfwServices_1_1IService')
        inherits_object = soup.find(class_='inherit_header pub_methods_classfwData_1_1Object')
        inherits_exception = soup.find(class_='inherit_header pub_methods_classfwCore_1_1Exception')

        item_type_re = {
            "Service": SRV_RE,
            "Object": OBJ_RE,
            "Interface": IFACE_RE,
            "Exception": EXCEPT_RE
        }
        def is_item_type(soup, ty_str):
            """
            Test if the HTML contained in the supplied soup describes an element of the specified type based on the
            doxygen page title. Accepted types are 'Service', 'Object', 'Interface' and 'Exception'. If true, return an
            entry to add to the sqlite DB, else return None.
            """
            title = soup.title.get_text()
            match = item_type_re[ty_str].search(title)
            if match:
                path = match.group(1)
                repo = file_repo(soup)
                if repo is not None:
                    path = path + " ({})".format(repo)
                return (path, ty_str, f_)
            return None

        def is_bad_service(soup):
            """
            Test if the HTML contained in the supplied soup describes a service, with more lenient rules regarding
            the name of the service. If true, print a warning regarding the service name and return an entry to add to
            the sqlite DB, otherwise return None.
            """
            title = soup.title.get_text()
            match = BAD__SRV_RE.search(title)
            if match:
                path = match.group(1)
                srv = match.group(2)
                repo = file_repo(soup)
                if repo is not None:
                    path = path + " ({})".format(repo)
                print("Warning: service {} has a non-compliant name (no S prefix)".format(srv))
                return (path, "Service", f_)
            return None

        file_type_re = {
            "Class": CLASS_RE,
            "Namespace": NAMESPACE_RE,
            "Struct": STRUCT_RE,
        }
        def is_file_type(soup, ty_str):
            """
            Test if the HTML contained in the supplied soup describes an element of the specified type based on the
            doxygen page title. Accepted types are 'Class', 'Namespace', and 'Struct'. If true, return an
            entry to add to the sqlite DB, else return None.
            """
            title = soup.title.get_text()
            match = file_type_re[ty_str].search(title)
            if match:
                struct_ = match.group(1)
                return (struct_, ty_str, f_)
            return None

        if CLASS_FILE_RE.match(f_):
            # We know the file contains a class, find what kind of class
            class_triple = is_file_type(soup, 'Class')
            if class_triple is None:
                return new_entries
            class_name = class_triple[0]
            if inherits_iservice:
                # The class inherits IService, it can be a service or an interface
                triple = is_item_type(soup, 'Interface')
                if triple is not None:
                    new_entries.append(triple)
                else:
                    # Not an interface, probably a service
                    triple = is_item_type(soup, 'Service')
                    if triple is not None:
                        new_entries.append(triple)
                    else:
                        triple = is_bad_service(soup)
                        if triple is not None:
                            new_entries.append(triple)
                        else:
                            print("Warning: unexpected behaviour for class {} while parsing file {}".format(class_name, f_))
            elif class_name == "fwData::Object":
                # Special case, Object is not an actual data.
                new_entries.append((class_name, "Class", f_))
            elif inherits_object:
                # Not a service and inherits fwData::Object, this class is probably a data.
                triple = is_item_type(soup, 'Object')
                if triple is not None:
                    new_entries.append(triple)
            elif class_name == "fwCore::Exception":
                # Special case for fwCore::Exception
                new_entries.append((class_name, "Exception", f_))
            elif inherits_exception:
                # Inherits an exception type, this is probably an exception
                # TODO: I'm pretty sure this won't catch all exceptions in the codebase
                triple = is_item_type(soup, 'Exception')
                if triple is not None:
                    new_entries.append(triple)
            else:
                # Plain old class
                new_entries.append(class_triple)
        elif STRUCT_FILE_RE.match(f_):
            # We know the file contains a struct, find what kind of struct
            struct_triple = is_file_type(soup, 'Struct')
            if struct_triple is None:
                return new_entries
            new_entries.append(struct_triple)
            if inherits_exception:
                # Inherits an exception type, this is probably an exception
                # TODO: I'm pretty sure this won't catch all exceptions in the codebase
                triple = is_item_type(soup, 'Exception')
                if triple is not None:
                    new_entries.append(triple)
        elif NAMESPACE_FILE_RE.match(f_):
            # We know the file contains a namespace, find what kind of namespace (i.e. Bundle, Library, regular
            # namespace...)
            namespace_triple = is_file_type(soup, 'Namespace')
            if namespace_triple is None:
                return new_entries
            namespace_name = namespace_triple[0]
            if namespace_name in CFG['srclibs']:
                new_entries.append((namespace_name, "Library", f_))
            elif namespace_name in CFG['bundles']:
                # There is no 'Bundle' entry type, unfortunately. Component, Package or Module would be suitable
                # replacements. I chose Package.
                new_entries.append((namespace_name, "Package", f_))
            else:
                new_entries.append(namespace_triple)
    except UnicodeDecodeError:
        print('The file ' + f_ + ' is not valid UTF-8')
    except FileNotFoundError:
        # Occurs for files in the search subdirectory, it's OK, we don't care about those
        pass
    return new_entries

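# The UNIQUE index created in bootstrap_docset() together with 'INSERT OR
# IGNORE' below means duplicate (name, type, path) triples are silently dropped.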
def populate_db(conn_, services):
    """
    Fill the sqlite database with the supplied list of (name, entry_type, file_path) triples.
    """
    cur = conn_.cursor()
    for triple in services:
        try:
            cur.execute("INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?, ?, ?);", triple)
        except sqlite3.Error:
            print("Error inserting " + str(triple))
    conn_.commit()

def copy_files():
    """
    Copy the doxygen HTML files into the docset destination.
    """
    try:
        shutil.copytree('./html', './fw4spl.docset/Contents/Resources/Documents')
    except shutil.Error as err:
        errors = err.args[0]
        print("Warning: some files were not copied correctly. The generated docset might be incomplete.")
        for src, _, why in errors:
            print("File '" + src + "' was not copied correctly. Reason: " + why)

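# Overall pipeline: load the configuration, create the docset skeleton, index
# every top-level HTML page plus the related pages, fill the database, then
# copy the whole HTML tree into the docset.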
def main():
    """
    Build the dash docset.
    """
    global CFG
    global REPO_NAMES
    try:
        CFG = json.loads(open('./projects.json', encoding="utf8").read())
    except (OSError, json.JSONDecodeError) as err:
        print("Error loading configuration file: " + str(err))
        sys.exit(1)
    # Use the parent directory name for repositories checked out as '.../src'.
    REPO_NAMES = {repo: Path(repo).parent.name if Path(repo).name == "src" else Path(repo).name
                  for repo in CFG['repositories']}
    conn = bootstrap_docset()
    html_files = gather_sources()
    entries = list()
    for f in html_files:
        f_entries = parse_file(f)
        if f_entries:
            entries += f_entries
    entries += parse_related_pages()
    populate_db(conn, entries)
    copy_files()
    conn.close()

if __name__ == '__main__':
    main()