| import os |
| import re |
| from pathlib import Path |
| import yaml |
|
|
| |
# Repository root: two directory levels above the resolved location of this file.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Keywords looked up (case-insensitively, as substrings) in document text by
# extract_tags(); every keyword found is added to the document's tag list.
KEYWORD_TAGS = [
    "CCore",
    "CShell",
    "REPL",
    "Mesh",
    "Agent",
    "HMP",
    "MeshConsensus",
    "CogSync",
    "GMP",
    "EGP",
    "Ethics",
    "Scenarios",
    "JSON",
]

# Output locations.
# NOTE(review): these are relative to the current working directory, whereas
# REPO_ROOT is derived from this file's location - presumably the script is
# expected to be launched from the repository root; confirm.
ROOT_DIR = Path(".")
STRUCTURED_DIR = ROOT_DIR.joinpath("structured_md")
INDEX_FILE = STRUCTURED_DIR.joinpath("index.md")

# File extension (lower case, with the leading dot) that marks a Markdown file.
MD_EXT = ".md"
|
|
| |
# str.format() templates for fenced JSON-LD blocks, keyed by document type.
# Doubled braces are literal braces; single-brace names are placeholders.
# Placeholder values are inserted verbatim, so callers must supply text that
# is already valid inside JSON (escaped strings / serialized arrays).
JSON_LD_TEMPLATES = {
    "FAQ": (
        "\n```json\n"
        "{{\n"
        '"@context": "https://schema.org",\n'
        '"@type": "FAQPage",\n'
        '"mainEntity": {main_entity}\n'
        "}}\n"
        "```\n"
    ),
    "HowTo": (
        "\n```json\n"
        "{{\n"
        '"@context": "https://schema.org",\n'
        '"@type": "HowTo",\n'
        '"name": "{title}",\n'
        '"description": "{description}",\n'
        '"step": {steps}\n'
        "}}\n"
        "```\n"
    ),
    "Article": (
        "\n```json\n"
        "{{\n"
        '"@context": "https://schema.org",\n'
        '"@type": "Article",\n'
        '"name": "{title}",\n'
        '"description": "{description}"\n'
        "}}\n"
        "```\n"
    ),
}
|
|
# Matches a front-matter block: a "---" line, the (lazily matched) payload in
# group 1, and a closing "---" line. DOTALL lets the payload span several lines.
FRONT_MATTER_RE = re.compile(
    r"^---\n(.*?)\n---\n",
    flags=re.DOTALL,
)
|
|
def is_md_file(path):
    """Return True when *path* has a ``.md`` suffix and STRUCTURED_DIR is not one of its parents.

    The suffix comparison is case-insensitive.

    NOTE(review): the parent check is an exact path comparison, so the
    relative STRUCTURED_DIR never equals a parent of an absolute *path* -
    confirm that callers pass paths relative to the working directory.
    """
    if path.suffix.lower() != MD_EXT:
        return False
    return STRUCTURED_DIR not in path.parents
|
|
def extract_front_matter(content: str):
    """Split a document into its YAML front matter and the remaining body.

    Args:
        content: Full text of the document.

    Returns:
        A ``(front_matter, body)`` tuple. ``front_matter`` is always a dict:
        it is empty when the document has no front-matter block, when the
        block cannot be parsed, or when it parses to something other than a
        mapping (a list or a bare scalar). ``body`` is *content* without the
        front-matter block; when no block is found it is *content* unchanged.
    """
    match = FRONT_MATTER_RE.match(content)
    if match is None:
        return {}, content

    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Deliberately best effort: a malformed header must not abort the
        # whole run. Kept broad because the parser may presumably raise
        # errors outside yaml.YAMLError for some malformed scalars.
        data = None

    # Callers use dict methods (``.get``); a list or scalar would crash them.
    if not isinstance(data, dict):
        data = {}
    return data, content[match.end():]
|
|
| def detect_file_type(content: str, front_matter: dict | None = None) -> str: |
| """Определяет тип: FAQ / HowTo / Article (по front-matter или заголовкам).""" |
| front_matter = front_matter or {} |
| if "type" in front_matter: |
| return front_matter["type"] |
|
|
| |
| if re.search(r"^#\s*FAQ\b", content, re.MULTILINE) or re.search(r"^##\s*Q&A\b", content, re.MULTILINE): |
| return "FAQ" |
| if re.search(r"^#\s*HowTo\b", content, re.MULTILINE) or re.search(r"^#\s*Как\s+сделать\b", content, re.IGNORECASE | re.MULTILINE): |
| return "HowTo" |
| return "Article" |
|
|
def parse_front_matter(content):
    """Return the YAML front matter of *content* as a dict.

    The result is an empty dict when *content* has no front-matter block,
    when the block cannot be parsed, or when it parses to something other
    than a mapping (for example an empty block, a list or a bare scalar).
    """
    match = FRONT_MATTER_RE.match(content)
    if match is None:
        return {}

    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Best effort by design: an unreadable header is treated as absent.
        return {}

    # safe_load yields None for an empty block and may yield a list or scalar;
    # normalise so the return type is always a dict.
    return data if isinstance(data, dict) else {}
|
|
def determine_type(content, front_matter):
    """Classify a document as "FAQ", "HowTo" or "Article".

    Args:
        content: Document text.
        front_matter: Parsed front matter. Anything that is not a mapping
            (``None``, a list, a string) is treated as empty.

    Returns:
        The ``type`` entry of *front_matter* when present; otherwise "FAQ" if
        any line starting with ``#`` contains "FAQ", "HowTo" if any such line
        contains "HowTo", and "Article" in every other case.
    """
    from collections.abc import Mapping

    # A missing or malformed header must not raise here; fall back to headings.
    meta = front_matter if isinstance(front_matter, Mapping) else {}
    if "type" in meta:
        return meta["type"]

    if re.search(r"^#.*FAQ", content, re.MULTILINE):
        return "FAQ"
    if re.search(r"^#.*HowTo", content, re.MULTILINE):
        return "HowTo"
    return "Article"
|
|
def _faq_entities(content):
    """Build schema.org ``Question`` entries from the level-2 headings of *content*.

    Every ``## heading`` becomes a question whose answer is the text up to the
    next ``## heading`` (or the end of the document), stripped of surrounding
    whitespace. Deeper headings (``###`` and below) stay part of the answer.
    """
    headings = list(re.finditer(r"^##(?!#)[ \t]*(\S.*)$", content, re.MULTILINE))
    # Each section ends where the next heading starts; the last one runs to
    # the end of the document.
    section_ends = [h.start() for h in headings[1:]] + [len(content)]
    return [
        {
            "@type": "Question",
            "name": heading.group(1).strip(),
            "acceptedAnswer": {
                "@type": "Answer",
                "text": content[heading.end():section_end].strip(),
            },
        }
        for heading, section_end in zip(headings, section_ends)
    ]


def generate_json_ld(content, front_matter, ftype, title, rel_path):
    """Render a fenced schema.org JSON-LD block describing one document.

    Args:
        content: Document body (without front matter).
        front_matter: Parsed front matter; only ``description`` is read.
            Anything that is not a dict is treated as empty.
        ftype: "FAQ" or "HowTo"; any other value is rendered as an Article.
        title: Document title (used for HowTo and Article).
        rel_path: Path of the document relative to the mirror root; it must
            provide ``as_posix()``.

    Returns:
        A string of the form ``"\\n```json\\n<JSON>\\n```\\n"``. The JSON object
        always carries ``@context``, ``@type`` and ``url``; FAQ adds
        ``mainEntity``, HowTo adds ``name``/``description``/``step`` and
        Article adds ``name``/``description``.
    """
    import json  # function-scope import, as in the original implementation

    meta = front_matter if isinstance(front_matter, dict) else {}
    desc = meta.get("description", content[:100].replace("\n", " ") + "...")
    url = f"structured_md/{rel_path.as_posix()}"

    # Build a real object and let json.dumps do the escaping: interpolating
    # raw text into a string template breaks on quotes and backslashes.
    payload = {"@context": "https://schema.org"}
    if ftype == "FAQ":
        payload["@type"] = "FAQPage"
        payload["mainEntity"] = _faq_entities(content)
    elif ftype == "HowTo":
        payload["@type"] = "HowTo"
        payload["name"] = title
        payload["description"] = desc
        # Every "- item" bullet line is treated as one step.
        payload["step"] = [
            {"@type": "HowToStep", "name": step.strip()}
            for step in re.findall(r"^- (.+)$", content, re.MULTILINE)
        ]
    else:
        payload["@type"] = "Article"
        payload["name"] = title
        payload["description"] = desc
    payload["url"] = url

    body = json.dumps(payload, ensure_ascii=False, indent=2)
    return f"\n```json\n{body}\n```\n"
|
|
def add_index_link(content, file_path, index_file=None):
    """Append a footer that links to the structured_md index, unless already present.

    Args:
        content: Document text.
        file_path: Location of the document the text will be written to; the
            link is made relative to its parent directory.
        index_file: Location of the index file. Defaults to ``index.md``
            inside STRUCTURED_DIR.

    Returns:
        *content* with the footer appended, or *content* unchanged when it
        already contains the footer.
    """
    if index_file is None:
        index_file = STRUCTURED_DIR / "index.md"

    rel_path = os.path.relpath(index_file, file_path.parent)
    # A Markdown link target is a URL: force forward slashes, because
    # os.path.relpath uses the platform separator (a backslash on Windows).
    link_target = Path(rel_path).as_posix()

    link_line = f"\n\n---\n> ⚡ [AI friendly version docs (structured_md)]({link_target})\n"
    if link_line.strip() not in content:
        content += link_line
    return content
|
|
def extract_tags(content, existing_tags, keywords=None):
    """Merge existing tags with every keyword that occurs in *content*.

    Args:
        content: Document text to scan.
        existing_tags: Tags already attached to the document. May be empty or
            ``None``; a single string is treated as one tag.
        keywords: Keywords to look for (case-insensitive substring match).
            Defaults to KEYWORD_TAGS.

    Returns:
        A list without duplicates: the existing tags in their original order,
        followed by the matching keywords in the order of *keywords*. The
        order is deterministic, so repeated runs produce identical output.
    """
    if keywords is None:
        keywords = KEYWORD_TAGS

    if not existing_tags:
        existing = []
    elif isinstance(existing_tags, str):
        # A bare string would otherwise be split into single characters.
        existing = [existing_tags]
    else:
        existing = list(existing_tags)

    haystack = content.lower()  # lower-cased once rather than once per keyword
    # dict keys give de-duplication while keeping insertion order.
    tags = dict.fromkeys(existing)
    tags.update(dict.fromkeys(kw for kw in keywords if kw.lower() in haystack))
    return list(tags)
|
|
def mirror_md_files():
    """Mirror every Markdown file under REPO_ROOT into STRUCTURED_DIR.

    Files inside a ``structured_md`` directory and files named ``index.md``
    are skipped. Each mirrored file consists of:

    1. a regenerated YAML front matter (``title``, ``description``, ``type``,
       ``tags``),
    2. the original body (without its own front matter) plus a footer linking
       to the structured_md index,
    3. a fenced JSON-LD block.

    Returns:
        The list of processed paths, relative to REPO_ROOT.
    """
    processed = []
    for path in REPO_ROOT.rglob("*.md"):
        rel_path = path.relative_to(REPO_ROOT)
        # Test the repo-relative parts so that directories above the
        # repository root cannot trigger the exclusion.
        if "structured_md" in rel_path.parts or path.name.lower() == "index.md":
            continue

        target_path = STRUCTURED_DIR / rel_path
        target_path.parent.mkdir(parents=True, exist_ok=True)

        # utf-8-sig drops a leading BOM, which would otherwise keep the
        # front-matter pattern from matching at the start of the text.
        content = path.read_text(encoding="utf-8-sig")

        front_matter, clean_content = extract_front_matter(content)
        if not isinstance(front_matter, dict):
            # Front matter that is a list or scalar cannot be queried below.
            front_matter = {}
        ftype = detect_file_type(clean_content, front_matter)

        # Title: first level-1 heading ("##" and deeper are excluded),
        # otherwise the front-matter title, otherwise the file stem.
        h1_match = re.search(r"^#(?!#)[ \t]*(\S.*)$", clean_content, re.MULTILINE)
        if h1_match:
            title = h1_match.group(1).strip()
            summary_source = clean_content[h1_match.end():].strip()
        else:
            title = front_matter.get("title", path.stem)
            summary_source = clean_content
        description = front_matter.get(
            "description", summary_source[:200].replace("\n", " ") + "..."
        )

        tags = extract_tags(clean_content, front_matter.get("tags", []))

        fm_dict = {
            "title": title,
            "description": description,
            "type": ftype,
            "tags": tags,
        }
        yaml_fm = "---\n" + yaml.safe_dump(fm_dict, sort_keys=False, allow_unicode=True) + "---\n\n"

        # Build the JSON-LD from the untouched body: once the index footer is
        # appended it would become part of the last extracted FAQ answer.
        json_ld = generate_json_ld(clean_content, front_matter, ftype, title, rel_path)
        body = add_index_link(clean_content, target_path)

        target_path.write_text(f"{yaml_fm}{body.rstrip()}\n\n{json_ld}", encoding="utf-8")
        processed.append(rel_path)

    return processed
| |
def generate_index(files, index_file=None):
    """Write a Markdown index that lists *files* as a nested bullet tree.

    Args:
        files: Relative paths of the mirrored documents.
        index_file: Where to write the index. Defaults to INDEX_FILE.

    Directories become plain list items, files become links relative to the
    index location. Entries are sorted by name within each directory. Parent
    directories of the index file are created when missing.
    """
    from urllib.parse import quote

    index_file = INDEX_FILE if index_file is None else Path(index_file)

    # Nested dict: directory name -> subtree, file name -> None.
    tree = {}
    for entry in files:
        *dirs, filename = Path(entry).parts
        node = tree
        for dirname in dirs:
            node = node.setdefault(dirname, {})
        node[filename] = None

    def render_tree(node, parent="", level=0):
        lines = []
        # Nested Markdown list items must be indented to the parent item's
        # content, i.e. two spaces per level for "- " bullets.
        indent = "  " * level
        for name, subtree in sorted(node.items()):
            target = f"{parent}/{name}" if parent else name
            if subtree is None:
                # Percent-encode the target: spaces or parentheses in a file
                # name would otherwise terminate the link destination.
                lines.append(f"{indent}- [{name}]({quote(target)})")
            else:
                lines.append(f"{indent}- {name}")
                lines.extend(render_tree(subtree, target, level + 1))
        return lines

    index_lines = ["# ИИ-дружелюбные версии файлов\n"]
    index_lines.extend(render_tree(tree))

    index_file.parent.mkdir(parents=True, exist_ok=True)
    # End the file with a newline, as text files conventionally do.
    index_file.write_text("\n".join(index_lines) + "\n", encoding="utf-8")
|
|
if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, mirror the
    # Markdown files, write the index, then report how many were processed.
    STRUCTURED_DIR.mkdir(exist_ok=True)
    mirrored = mirror_md_files()
    generate_index(mirrored)
    summary = f"Обработано {len(mirrored)} файлов. Индекс создан: {INDEX_FILE}"
    print(summary)
|
|