Fix: Parse JSON Lines format from docker ps/images correctly

This commit is contained in:
Clawd
2026-02-06 18:00:23 +00:00
parent 13c2d1f43a
commit 8a59259165

View File

@@ -38,68 +38,110 @@ def human_size(size: int) -> str:
size /= 1024
return f"{size:.1f} PB"
def run_docker_cmd(cmd: List[str]) -> subprocess.CompletedProcess:
    """Run a docker command and return the completed process.

    Never raises: on failure to launch or on timeout, a synthetic
    CompletedProcess with returncode 1 and the error text in stderr is
    returned, so callers can uniformly check ``result.returncode``.
    """
    try:
        return subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except (subprocess.SubprocessError, OSError) as e:
        # SubprocessError covers TimeoutExpired; OSError covers a missing
        # docker binary / permission problems. Anything else is a bug and
        # should propagate rather than be silently converted to rc=1.
        return subprocess.CompletedProcess(cmd, 1, "", str(e))
def parse_jsonl(text: str) -> List[dict]:
    """Parse JSON Lines text (one JSON object per line).

    Blank lines and lines that fail to decode are silently skipped;
    the decodable objects are returned in input order.
    """
    objects: List[dict] = []
    for raw in text.strip().split('\n'):
        candidate = raw.strip()
        if not candidate:
            continue
        try:
            objects.append(json.loads(candidate))
        except json.JSONDecodeError:
            continue
    return objects
def get_docker_inspect(ids: List[str]) -> List[dict]:
    """Run ``docker inspect`` on a list of IDs and return the parsed JSON.

    Returns an empty list when ``ids`` is empty, when the docker command
    fails or times out, or when its output is not valid JSON. This is a
    deliberate best-effort contract: callers treat failure as "no data".
    """
    if not ids:
        return []
    try:
        result = subprocess.run(
            ["docker", "inspect"] + ids,
            capture_output=True, text=True, timeout=120
        )
        if result.returncode == 0 and result.stdout.strip():
            # docker inspect emits a single JSON array covering all IDs.
            return json.loads(result.stdout)
    except (subprocess.SubprocessError, OSError, json.JSONDecodeError):
        # Narrowed from a bare except: a bare clause would also swallow
        # KeyboardInterrupt/SystemExit. Launch, timeout, and decode
        # failures all fall through to the empty-list return below.
        pass
    return []
def extract_layer_ids(graphdriver_data: dict) -> Set[str]:
    """Extract overlay2 layer IDs from a GraphDriver ``Data`` mapping."""
    found: Set[str] = set()
    prefix = OVERLAY2_PATH + "/"
    # LowerDir may hold several ':'-separated paths; the other keys hold one,
    # but splitting a single path on ':' is harmless.
    for key in ("MergedDir", "UpperDir", "WorkDir", "LowerDir"):
        for path in graphdriver_data.get(key, "").split(":"):
            if OVERLAY2_PATH not in path:
                continue
            # Paths look like <OVERLAY2_PATH>/<layer_id>/diff — take the
            # first segment after the overlay2 root.
            segments = path.replace(prefix, "").split("/")
            if not segments:
                continue
            candidate = segments[0]
            # "l" is docker's short-symlink directory, not a real layer.
            if candidate and candidate != "l":
                found.add(candidate)
    return found
def get_referenced_layers() -> Dict[str, str]:
    """Get all overlay2 layer IDs referenced by containers and images.

    Returns a dict of ``layer_id -> description`` where the description is
    ``"container: <name>"`` or ``"image: <repo>:<tag>"``. Container
    references are recorded first and take precedence over image
    references for the same layer.
    """
    referenced: Dict[str, str] = {}

    # --- Containers -------------------------------------------------------
    # List every container (running or not) as JSON Lines.
    result = run_docker_cmd(["docker", "ps", "-a", "--format", "{{json .}}"])
    containers = parse_jsonl(result.stdout) if result.returncode == 0 else []

    # Map 12-char container ID -> display name, and collect IDs for a
    # single batched inspect (much faster than one inspect per container).
    container_names: Dict[str, str] = {}
    container_ids: List[str] = []
    for c in containers:
        cid = c.get("ID", "")
        cname = c.get("Names", cid)
        if cid:
            container_ids.append(cid)
            container_names[cid] = cname

    if container_ids:
        for info in get_docker_inspect(container_ids):
            # Inspect reports the full 64-char ID; docker ps reports 12.
            cid = info.get("Id", "")[:12]
            cname = container_names.get(cid, info.get("Name", cid).lstrip("/"))
            gd = info.get("GraphDriver", {}).get("Data", {})
            for layer_id in extract_layer_ids(gd):
                referenced[layer_id] = f"container: {cname}"

    # --- Images -----------------------------------------------------------
    result = run_docker_cmd(["docker", "images", "-a", "--format", "{{json .}}"])
    images = parse_jsonl(result.stdout) if result.returncode == 0 else []

    # Map 12-char image ID -> "repo:tag" (or the short ID for untagged).
    image_names: Dict[str, str] = {}
    image_ids: List[str] = []
    for img in images:
        img_id = img.get("ID", "")
        repo = img.get("Repository", "")
        tag = img.get("Tag", "")
        img_name = f"{repo}:{tag}" if repo and tag else img_id[:12]
        if img_id:
            image_ids.append(img_id)
            image_names[img_id] = img_name

    if image_ids:
        for info in get_docker_inspect(image_ids):
            # Inspect reports "sha256:<64 hex>"; docker images reports 12 hex.
            img_id = info.get("Id", "").replace("sha256:", "")[:12]
            img_name = image_names.get(img_id, img_id)
            gd = info.get("GraphDriver", {}).get("Data", {})
            for layer_id in extract_layer_ids(gd):
                # Container attribution wins; only fill in gaps.
                if layer_id not in referenced:
                    referenced[layer_id] = f"image: {img_name}"

    return referenced