Core Features

Dataset Reporting

Generate comprehensive reports and analytics for your TXT-format datasets.

Overview

cvPal's dataset reporting feature provides detailed analytics and insights about your datasets. Generate comprehensive reports that help you understand data distribution, quality, and structure for better model training decisions.

Report Features

Comprehensive Analysis

Detailed statistics and distributions

Pandas Integration

Export to CSV and advanced analysis

Quality Assessment

Identify issues and improvements

Basic Usage

Generate comprehensive dataset reports using the report function:

Generate Dataset Report

python
import pandas as pd

from cvpal.preprocessing import report

# Build the full per-image report as a pandas DataFrame.
df_report = report("/path/to/your/dataset")

# Summarise the per-image object counts from the report.
per_image_counts = df_report['num_of_labels']
print(f"Total images: {len(df_report)}")
print(f"Average labels per image: {per_image_counts.mean():.2f}")
print(f"Max labels in single image: {per_image_counts.max()}")
print(f"Min labels in single image: {per_image_counts.min()}")

📊 Report Structure

  • image_path - Full path to image file
  • label_path - Full path to label file
  • num_of_labels - Number of objects in image
  • labels - List of label names in image
  • directory - Split (train/test/valid)

🎯 Use Cases

  • Dataset quality assessment
  • Label distribution analysis
  • Split balance verification
  • Data cleaning and filtering
  • Training preparation

Split Analysis

Analyze the distribution of data across train, test, and validation splits:

Split Distribution Analysis

python
# Analyze how images are distributed across the dataset splits.
split_counts = df_report['directory'].value_counts()
print("Split distribution:")
print(split_counts)

# Report each split as a share of the whole dataset.
total_images = len(df_report)
for split, count in split_counts.items():
    percentage = (count / total_images) * 100
    print(f"{split}: {count} images ({percentage:.1f}%)")

# Warn about any expected split that contains no images at all.
empty_splits = [split for split in ['train', 'test', 'valid'] if split not in split_counts]
if empty_splits:
    print(f"Warning: Empty splits found: {empty_splits}")

Split Balance Recommendations

Train Split

70-80% of total data

Validation Split

10-15% of total data

Test Split

10-15% of total data

Label Analysis

Analyze label distribution and frequency across your dataset:

Label Frequency Analysis

python
# Flatten the per-image label lists into one list of label instances.
all_labels = []
for labels in df_report['labels']:
    all_labels.extend(labels)

# Count how often each label occurs across the whole dataset.
label_counts = pd.Series(all_labels).value_counts()
print("Label frequency distribution:")
print(label_counts)

# Overall label statistics.
total_labels = len(all_labels)
unique_labels = len(label_counts)
print(f"\nTotal label instances: {total_labels}")
print(f"Unique labels: {unique_labels}")
print(f"Average labels per image: {total_labels / len(df_report):.2f}")

# Highlight both extremes of the frequency distribution.
most_common = label_counts.head(5)
least_common = label_counts.tail(5)
print(f"\nMost common labels: {most_common.to_dict()}")
print(f"Least common labels: {least_common.to_dict()}")

Class Imbalance Detection

python
# Quantify imbalance as the ratio of the most to the least frequent label.
max_count = label_counts.max()
min_count = label_counts.min()
imbalance_ratio = max_count / min_count
print(f"Class imbalance ratio: {imbalance_ratio:.2f}")

if imbalance_ratio > 10:
    print("⚠️ High class imbalance detected!")
    print("Consider data augmentation or resampling strategies")
elif imbalance_ratio > 5:
    print("⚠️ Moderate class imbalance detected")
    print("Monitor training performance carefully")
else:
    print("✅ Class distribution is relatively balanced")

# Flag labels that are too rare to learn from reliably.
problematic_labels = label_counts[label_counts < 10]
if len(problematic_labels) > 0:
    print(f"\nLabels with very few instances (<10): {problematic_labels.to_dict()}")

Image Analysis

Analyze the distribution of objects per image and identify potential issues:

Objects Per Image Analysis

python
# Per-image object-count statistics.
objects_per_image = df_report['num_of_labels']
print("Objects per image statistics:")
print(f"Mean: {objects_per_image.mean():.2f}")
print(f"Median: {objects_per_image.median():.2f}")
print(f"Standard deviation: {objects_per_image.std():.2f}")
print(f"Min: {objects_per_image.min()}")
print(f"Max: {objects_per_image.max()}")

# Full histogram of object counts per image.
print("\nDistribution of objects per image:")
distribution = objects_per_image.value_counts().sort_index()
print(distribution)

# Outliers: images with no objects, or with an unusually large number.
empty_images = df_report[df_report['num_of_labels'] == 0]
crowded_images = df_report[df_report['num_of_labels'] > 20]
print(f"\nEmpty images (no objects): {len(empty_images)}")
print(f"Crowded images (>20 objects): {len(crowded_images)}")

if len(empty_images) > 0:
    print("\nEmpty images found:")
    print(empty_images[['image_path', 'directory']].head())

Quality Indicators

✅ Good Signs

  • Balanced split distribution
  • Reasonable objects per image (1-10)
  • No empty images
  • Low class imbalance ratio
  • Consistent labeling

⚠️ Warning Signs

  • Highly imbalanced splits
  • Many empty images
  • Extreme class imbalance (>10:1)
  • Very crowded images (>20 objects)
  • Missing label files

Export and Visualization

Export reports and create visualizations for better insights:

Export Reports

python
# Export the full per-image report for downstream analysis.
df_report.to_csv("dataset_report.csv", index=False)
print("Full report exported to dataset_report.csv")

# Collect headline statistics into a single-row summary.
summary_stats = {
    'total_images': len(df_report),
    'total_labels': len(all_labels),
    'unique_labels': len(label_counts),
    'avg_labels_per_image': objects_per_image.mean(),
    'class_imbalance_ratio': imbalance_ratio,
    'empty_images': len(empty_images),
    'crowded_images': len(crowded_images),
}

summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv("dataset_summary.csv", index=False)
print("Summary statistics exported to dataset_summary.csv")

# Per-label frequency table.
label_counts.to_csv("label_frequency.csv", header=['count'])
print("Label frequency exported to label_frequency.csv")

Create Visualizations

python
import matplotlib.pyplot as plt
import seaborn as sns

# Use the seaborn-flavoured matplotlib style for all four panels.
plt.style.use('seaborn-v0_8')
fig, ((ax_splits, ax_labels), (ax_objects, ax_imbalance)) = plt.subplots(2, 2, figsize=(15, 12))

# Panel 1: how many images land in each split.
split_counts.plot(kind='bar', ax=ax_splits, color=['#1f77b4', '#ff7f0e', '#2ca02c'])
ax_splits.set_title('Dataset Split Distribution')
ax_splits.set_ylabel('Number of Images')

# Panel 2: the ten most frequent labels.
label_counts.head(10).plot(kind='bar', ax=ax_labels, color='#ff7f0e')
ax_labels.set_title('Top 10 Most Frequent Labels')
ax_labels.set_ylabel('Count')
ax_labels.tick_params(axis='x', rotation=45)

# Panel 3: histogram of objects per image.
objects_per_image.hist(bins=20, ax=ax_objects, color='#2ca02c', alpha=0.7)
ax_objects.set_title('Distribution of Objects per Image')
ax_objects.set_xlabel('Number of Objects')
ax_objects.set_ylabel('Frequency')

# Panel 4: label counts normalised by the most frequent label.
imbalance_data = label_counts / label_counts.max()
imbalance_data.plot(kind='bar', ax=ax_imbalance, color='#d62728')
ax_imbalance.set_title('Class Imbalance (Normalized)')
ax_imbalance.set_ylabel('Normalized Count')
ax_imbalance.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('dataset_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

Advanced Analysis

Perform deeper analysis to identify patterns and potential improvements:

Cross-Split Label Analysis

python
# Tally label frequencies separately for each split.
split_label_analysis = {}
for split in ['train', 'test', 'valid']:
    split_data = df_report[df_report['directory'] == split]
    split_labels = []
    for labels in split_data['labels']:
        split_labels.extend(labels)
    split_label_analysis[split] = pd.Series(split_labels).value_counts()

# Union of every label seen in any split.
all_unique_labels = set()
for per_split_counts in split_label_analysis.values():
    all_unique_labels.update(per_split_counts.index)

# A label absent from a split cannot be trained or evaluated there.
missing_labels = {}
for split, counts in split_label_analysis.items():
    missing = all_unique_labels - set(counts.index)
    if missing:
        missing_labels[split] = list(missing)

if missing_labels:
    print("Labels missing in certain splits:")
    for split, labels in missing_labels.items():
        print(f"{split}: {labels}")
else:
    print("✅ All labels present in all splits")

# Fraction of all known labels that each split covers.
coverage_analysis = {}
for split, counts in split_label_analysis.items():
    coverage = len(counts) / len(all_unique_labels) * 100
    coverage_analysis[split] = coverage
    print(f"{split} label coverage: {coverage:.1f}%")

Dataset Quality Score

python
def calculate_quality_score(df_report, label_counts):
    """Score dataset quality on a 0-100 scale.

    Starts from a perfect score of 100 and subtracts penalties for
    empty images, class imbalance, missing splits, a small dataset,
    and too few distinct labels.

    Args:
        df_report: Report DataFrame with at least the columns
            'num_of_labels' and 'directory'.
        label_counts: Series mapping label name -> instance count.
            Assumed non-empty with a non-zero minimum count.

    Returns:
        int: Quality score clamped to the range [0, 100].
    """
    score = 100  # start from a perfect score and subtract penalties

    # Empty images carry no training signal: 2 points each, capped at 20.
    empty_images = len(df_report[df_report['num_of_labels'] == 0])
    if empty_images > 0:
        score -= min(20, empty_images * 2)

    # Class imbalance: the larger the max/min frequency ratio, the worse.
    imbalance_ratio = label_counts.max() / label_counts.min()
    if imbalance_ratio > 10:
        score -= 30
    elif imbalance_ratio > 5:
        score -= 15
    elif imbalance_ratio > 3:
        score -= 5

    # A usable dataset needs all three of train/test/valid.
    split_counts = df_report['directory'].value_counts()
    if len(split_counts) < 3:
        score -= 20  # missing splits

    # Small datasets are penalized on a sliding scale.
    total_images = len(df_report)
    if total_images < 100:
        score -= 20
    elif total_images < 500:
        score -= 10

    # Too few distinct labels limits what a model can learn.
    if len(label_counts) < 3:
        score -= 15

    return max(0, score)
# Score the dataset and translate the number into a verdict.
quality_score = calculate_quality_score(df_report, label_counts)
print(f"Dataset Quality Score: {quality_score}/100")

if quality_score >= 80:
    print("✅ Excellent dataset quality!")
elif quality_score >= 60:
    print("⚠️ Good dataset quality with room for improvement")
elif quality_score >= 40:
    print("⚠️ Fair dataset quality - consider improvements")
else:
    print("❌ Poor dataset quality - significant improvements needed")