You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

127 lines
4.0 KiB

7 years ago
  1. def all_parens(s,pairs = ['()','{}','[]']):
  2. pair_levels = dict((pair,0) for pair in pairs)
  3. stored_levels = dict((pair,{}) for pair in pairs)
  4. ret = {}
  5. for char_index in range(len(s)):
  6. try:
  7. pair_type = next(filter(lambda pair: s[char_index] in pair and not s[char_index-1] == '\\',pairs))
  8. type = pair_type.index(s[char_index])
  9. if type == 0:
  10. pair_levels[pair_type] += 1
  11. stored_levels[pair_type][pair_levels[pair_type]] = char_index
  12. else:
  13. try:
  14. level = pair_levels[pair_type]
  15. start = stored_levels[pair_type][level]
  16. ret[start] = char_index
  17. except KeyError:
  18. pass
  19. pair_levels[pair_type] -= 1
  20. except StopIteration:
  21. pass
  22. return ret
  23. def string_levels(s,*pairs):
  24. if not pairs:
  25. pairs = ['()','{}','[]']
  26. all = all_parens(s,pairs)
  27. print(s)
  28. for key in sorted(all.keys()):
  29. value = all[key]
  30. yield s[key:value+1]
  31. class base_regex_blob(object):
  32. def __init__(self,string,start,end):
  33. self.start = start
  34. self.end = end
  35. self.string = string
  36. self.parent = None
  37. def __str__(self):
  38. return self.string[self.start:self.end+1]
  39. def __repr__(self):
  40. return repr(str(self))
  41. def wrap(self):
  42. return 'r"{}"'.format(self)
  43. def __lt__(self,other):
  44. return self.start < other.start and self.end > other.start
  45. def to_dict(self):
  46. return {
  47. 'single':str(self),
  48. 'isblob':True
  49. }
  50. class regex_group(base_regex_blob):
  51. def from_string(string,*pairs):
  52. if not pairs:
  53. pairs = ['()','{}','[]']
  54. _all = all_parens(string,pairs)
  55. groups = (
  56. regex_group(string,key,_all[key]) for key in sorted(_all.keys())
  57. )
  58. root = regex_group(test,0,len(test)-1)
  59. for group in groups:
  60. root._handover_(group)
  61. root.blobify()
  62. return root
  63. def __init__(self,string,start,end):
  64. super().__init__(string,start,end)
  65. self.children = []
  66. self.blobs = []
  67. def __contains__(self,other):
  68. return self.start <= other.start and self.end >= other.end
  69. def __getitem__(self,key):
  70. return self.children[key]
  71. def __len__(self):
  72. base = len(self.children)
  73. return base + sum(map(len,self.children))
  74. def _handover_(self,group):
  75. if group in self:
  76. for child in self.children:
  77. child._handover_(group)
  78. if not group.parent:
  79. group.parent = self
  80. self.children.append(group)
  81. def _make_blobs_(self):
  82. indices = []
  83. for child in self.children:
  84. indices += [child.start,child.end]
  85. indices = indices[1:-1]
  86. for pair in range(0,len(indices),2):
  87. start,stop = indices[pair:pair+2]
  88. if stop-start > 1:
  89. blob = base_regex_blob(self.string,start+1,stop-1)
  90. blob.parent = self
  91. self.blobs.append(blob)
  92. def to_dict(self):
  93. single = ''
  94. all = []
  95. if self.children:
  96. all = sorted(self.blobs+self.children)
  97. else:
  98. single = str(self)
  99. return {
  100. "single":single,
  101. "children":[child.to_dict() for child in all],
  102. "isblob":False,
  103. }
  104. def blobify(self):
  105. self._make_blobs_()
  106. for child in self.children:
  107. child.blobify()
  108. if __name__ == "__main__":
  109. test = r"(?P<show>(\w+)(\W\w+)*)\W*[Ss](eason|(?=\d))\D*(?P<season>\d+)\W*[Ee](pisode|(?=\d))\D*(?P<episode>\d+\w?([^\d\w&]*&[^\d\w&]\d+\w?)?)\W*(?P<other>.*)(?P<ext>\.\w+)$"
  110. root = regex_group.from_string(test,'()','[]')
  111. import pprint
  112. pprint.pprint(root.to_dict())