|
|
# Bracket pairs recognised when the caller does not supply any.
DEFAULT_PAIRS = ('()', '{}', '[]')


def all_parens(s, pairs=DEFAULT_PAIRS):
    """Map the index of every matched opening bracket to its closing index.

    Args:
        s: The text to scan.
        pairs: Iterable of two-character strings; the first character opens
            a pair and any other character of the string closes it.

    Returns:
        A dict ``{open_index: close_index}``. Brackets immediately preceded
        by a backslash are ignored, and closing brackets that have no
        pending opener are skipped.
    """
    # One stack of pending opener positions per pair type, so different
    # bracket kinds are matched independently of each other.
    pending = {pair: [] for pair in pairs}
    matches = {}
    for index, char in enumerate(s):
        # Only look back when there IS a previous character; at index 0 a
        # lookup of s[index - 1] would wrap around to the last character.
        # NOTE(review): a doubled backslash before a bracket is still
        # treated as an escape, exactly as before.
        if index > 0 and s[index - 1] == '\\':
            continue
        pair = next((candidate for candidate in pairs if char in candidate), None)
        if pair is None:
            continue
        if pair.index(char) == 0:
            pending[pair].append(index)
        elif pending[pair]:
            matches[pending[pair].pop()] = index
    return matches


def string_levels(s, *pairs):
    """Yield every bracketed substring of *s*, ordered by opening position.

    Args:
        s: The text to scan.
        *pairs: Bracket pairs to recognise; defaults to ``DEFAULT_PAIRS``.

    Yields:
        Each matched span of *s*, including its delimiters.
    """
    if not pairs:
        pairs = DEFAULT_PAIRS
    spans = all_parens(s, pairs)
    # NOTE(review): this echo looks like leftover debugging output; it is
    # kept so existing callers observe the same stdout.
    print(s)
    for start in sorted(spans):
        yield s[start:spans[start] + 1]


class base_regex_blob(object):
    """An inclusive ``[start, end]`` slice of a string with a parent link."""

    def __init__(self, string, start, end):
        self.start = start
        self.end = end
        self.string = string
        self.parent = None

    def __str__(self):
        # ``end`` is inclusive, hence the + 1.
        return self.string[self.start:self.end + 1]

    def __repr__(self):
        return repr(str(self))

    def wrap(self):
        """Return the span's text formatted as a raw-string literal."""
        return 'r"{}"'.format(self)

    def __lt__(self, other):
        # True only when this span starts first AND reaches past the other
        # span's start; it is False in both directions for disjoint spans.
        return self.start < other.start and self.end > other.start

    def to_dict(self):
        """Return a plain-dict description of this span."""
        return {
            'single': str(self),
            'isblob': True,
        }


class regex_group(base_regex_blob):
    """A bracketed span that can hold nested groups and the text between them."""

    @classmethod
    def from_string(cls, string, *pairs):
        """Build the tree of nested groups found in *string*.

        Args:
            string: The text to parse.
            *pairs: Bracket pairs to recognise; defaults to ``DEFAULT_PAIRS``.

        Returns:
            A root group spanning the whole string, with every matched
            bracket span attached beneath it and blobs already computed.
        """
        if not pairs:
            pairs = DEFAULT_PAIRS
        spans = all_parens(string, pairs)
        # The root covers the string that was passed in, not a module-level
        # global, so the method also works when the module is imported.
        root = cls(string, 0, len(string) - 1)
        # Ascending start order hands outer groups over before the groups
        # nested inside them.
        for start in sorted(spans):
            root._handover_(cls(string, start, spans[start]))
        root.blobify()
        return root

    def __init__(self, string, start, end):
        super().__init__(string, start, end)
        self.children = []
        self.blobs = []

    def __contains__(self, other):
        return self.start <= other.start and self.end >= other.end

    def __getitem__(self, key):
        return self.children[key]

    def __len__(self):
        # Number of descendant groups at every depth.
        return len(self.children) + sum(map(len, self.children))

    def _handover_(self, group):
        """Attach *group* under the deepest group that encloses it."""
        if group not in self:
            return
        # Offer the group to the children first so the innermost enclosing
        # group claims it.
        for child in self.children:
            child._handover_(group)
        # Compare with None explicitly: truthiness of a group goes through
        # __len__, which is 0 for a group without children.
        if group.parent is None:
            group.parent = self
            self.children.append(group)

    def _make_blobs_(self):
        """Record the text lying between consecutive children as blobs."""
        # Only gaps BETWEEN siblings are captured; text before the first
        # child or after the last one is not turned into a blob.
        for left, right in zip(self.children, self.children[1:]):
            if right.start - left.end > 1:
                blob = base_regex_blob(self.string, left.end + 1, right.start - 1)
                blob.parent = self
                self.blobs.append(blob)

    def to_dict(self):
        """Return a nested plain-dict description of this group."""
        if self.children:
            single = ''
            # Sort on start position explicitly: __lt__ is False for
            # disjoint siblings, so a bare sorted() would leave every blob
            # ahead of every child instead of interleaving them.
            members = sorted(self.blobs + self.children,
                             key=lambda part: part.start)
        else:
            single = str(self)
            members = []
        return {
            "single": single,
            "children": [member.to_dict() for member in members],
            "isblob": False,
        }

    def blobify(self):
        """Compute blobs for this group and, recursively, for all children."""
        self._make_blobs_()
        for child in self.children:
            child.blobify()


if __name__ == "__main__":
    test = r"(?P<show>(\w+)(\W\w+)*)\W*[Ss](eason|(?=\d))\D*(?P<season>\d+)\W*[Ee](pisode|(?=\d))\D*(?P<episode>\d+\w?([^\d\w&]*&[^\d\w&]\d+\w?)?)\W*(?P<other>.*)(?P<ext>\.\w+)$"
    root = regex_group.from_string(test, '()', '[]')
    import pprint
    pprint.pprint(root.to_dict())
|