Suspend/resume on A311D

Hello! I’ve been unable to suspend/resume on Pocket Reform on A311D – suspend appears to work but it’s impossible to resume, due to a kernel error during the process of suspending. I’ve attached a log (with loglevel=7) from over serial below, taken during sudo systemctl suspend. There’s no further output if I try to wake the device using the menu on the system controller. I’m curious:

  • Is this what people are seeing on A311D on other devices, either Pocket Reform or Reform?
  • Does anyone recognize this trace, before I dive deeper into it?
[   50.465859] wlx000a5205ed22: authenticate with 0c:80:63:cd:0f:f6 (local
address=00:0a:52:05:ed:22)
[   50.469429] wlx000a5205ed22: send auth to 0c:80:63:cd:0f:f6 (try 1/3)
[   50.880015] wlx000a5205ed22: send auth to 0c:80:63:cd:0f:f6 (try 2/3)
[   50.881489] wlx000a5205ed22: send auth to 0c:80:63:cd:0f:f6 (try 3/3)
[   51.296033] wlx000a5205ed22: authentication with 0c:80:63:cd:0f:f6 timed out
[   57.776632] panel-jdi-lt070me05000 ffd07000.dsi.0: vsync-shift from device tree: 1
[   78.654120] PM: suspend entry (deep)
[   78.662892] Filesystems sync: 0.008 seconds
[   79.004547] Freezing user space processes
[   79.006234] Freezing user space processes completed (elapsed 0.001 seconds)
[   79.009998] OOM killer disabled.
[   79.013194] Freezing remaining freezable tasks
[   79.018974] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
[   79.925031] meson-pcie fc000000.pcie: error: wait linkup timeout
[   79.925834] Disabling non-boot CPUs ...
[   79.930499] IRQ22: set affinity failed(-22).
[   79.931615] psci: CPU1 killed (polled 0 ms)
[   79.939232] IRQ22: set affinity failed(-22).
[   79.940314] psci: CPU2 killed (polled 4 ms)
[   79.948029] IRQ22: set affinity failed(-22).
[   79.949102] psci: CPU3 killed (polled 0 ms)
[   79.956013] IRQ22: set affinity failed(-22).
[   79.957087] psci: CPU4 killed (polled 0 ms)
[   79.964808] IRQ22: set affinity failed(-22).
[   79.965880] psci: CPU5 killed (polled 0 ms)
bl30 get wakeup sources!
process command 00000006
bl30 enter suspend!
Little core clk suspend rate 1000000000
Big core clk suspend rate 24000000
store restore gp0 pll
suspend_counter: 1
Enter ddr suspend
ddr suspend time: 15us
alarm=0S
process command 00000001
cec ver:2018/04/29
CEC cfg:0x0000
use vddee new table!
use vddee new table!
exit_reason:0x06
Enter ddr resume
ddr resume time: 123us
store restore gp0 pll
cfg15 3b00000
cfg15 63b00000
Lit[   79.972457] Enabling non-boot CPUs ...
[   79.972945] Detected VIPT I-cache on CPU1
[   79.973025] CPU1: Booted secondary processor 0x0000000001 [0x410fd034]
[   79.973873] CPU1 is up
[   79.984530] Detected VIPT I-cache on CPU2
[   79.984595] arch_timer: CPU2: Trapping CNTVCT access
[   79.984609] CPU2: Booted secondary processor 0x0000000100 [0x410fd092]
[   79.985091] cpufreq: cpufreq_online: CPU2: Running at unlisted initial frequency: 999999 KHz,
 changing to: 1000000 KHz
[   80.009931] CPU2 is up
[   80.012300] Detected VIPT I-cache on CPU3
[   80.012335] arch_timer: CPU3: Trapping CNTVCT access
[   80.012343] CPU3: Booted secondary processor 0x0000000101 [0x410fd092]
[   80.012885] CPU3 is up
[   80.030041] Detected VIPT I-cache on CPU4
[   80.030076] arch_timer: CPU4: Trapping CNTVCT access
[   80.030085] CPU4: Booted secondary processor 0x0000000102 [0x410fd092]
[   80.030654] CPU4 is up
[   80.047800] Detected VIPT I-cache on CPU5
[   80.047836] arch_timer: CPU5: Trapping CNTVCT access
[   80.047844] CPU5: Booted secondary processor 0x0000000103 [0x410fd092]
[   80.048675] CPU5 is up
tle core clk resume rate 1000000000
Big core clk resume rate 50000000
[   80.115334] meson-pcie fc000000.pcie: error: wait linkup timeout
[   80.116296] nvme 0000:01:00.0: Unable to change power state from D3hot to D0, device inaccess
ible
[   80.176107] meson-pcie fc000000.pcie: error: wait linkup timeout
[... 189 of these meson-pcie fc000000.pcie: error: wait linkup timeout messages]
[   89.918170] meson-pcie fc000000.pcie: error: wait linkup timeout
[   89.918957] nvme nvme0: Disabling device after reset failure: -19
[   91.023683] ------------[ cut here ]------------
[   91.024010] [CRTC:43:meson_crtc] vblank wait timed out
[   91.028751] WARNING: CPU: 4 PID: 1530 at drivers/gpu/drm/drm_atomic_helper.c:1682 drm_atomic_
helper_wait_for_vblanks.part.0+0x248/0x278
[   91.040138] Modules linked in: realtek snd_seq_dummy snd_hrtimer snd_seq snd_seq_device dwmac
_meson8b stmmac_platform stmmac mdio_mux_meson_g12a mdio_mux of_mdio rfcomm algif_hash algif_skc
ipher af_alg bnep mt76x2u mt76x2_common mt76x02_usb mt76_usb rtw88_8822cs binfmt_misc rtw88_8822
c rtw88_sdio mt76x02_lib rtw88_core mt76 hci_uart mac80211 btqca btrtl btintel btbcm bluetooth l
ibarc4 ecdh_generic ecc cfg80211 cdc_acm rfkill snd_soc_tlv320aic31xx reform2_lpc(OE) snd_soc_me
son_axg_sound_card snd_soc_meson_card_utils meson_gxbb_wdt snd_soc_meson_axg_tdmin meson_rng mes
on_saradc snd_soc_meson_axg_frddr snd_soc_meson_g12a_tohdmitx snd_soc_meson_axg_tdmout industria
lio snd_soc_meson_axg_tdm_interface leds_gpio nvmem_meson_efuse snd_soc_meson_axg_tdm_formatter
joydev evdev efi_pstore configfs nfnetlink ip_tables x_tables autofs4 ext4 mbcache jbd2 crc32c_g
eneric dm_crypt hid_generic dm_mod dax mali_dp snd_soc_meson_codec_glue usbhid hid mxsfb mux_mmi
o panel_edp imx_dcss ti_sn65dsi86 drm_dp_aux_bus nwl_dsi mux_core pwm_imx27
[   91.044321]  onboard_usb_dev snd_soc_hdmi_codec dw_hdmi_i2s_audio nvme nvme_core t10_pi xhci_
plat_hcd meson_gxl xhci_hcd smsc crc16 crc64_rocksoft_generic snd_soc_meson_axg_toddr snd_soc_me
son_axg_fifo dwc2 dwc3 meson_vdec(C) crc64_rocksoft snd_soc_core udc_core videobuf2_dma_contig c
rc_t10dif snd_compress videobuf2_memops crct10dif_generic polyval_ce v4l2_mem2mem snd_pcm_dmaeng
ine videobuf2_v4l2 polyval_generic usbcore ghash_ce crct10dif_ce ulpi gf128mul spi_gpio snd_pcm
videodev sha2_ce panfrost axg_audio pcs_xpcs videobuf2_common crc64 phylink dwc3_meson_g12a meso
n_dw_mipi_dsi snd_timer crct10dif_common sha256_arm64 drm_shmem_helper sclk_div spi_bitbang clk_
pwm gpu_sched reset_meson_audio_arb usb_common rtc_pcf8523 sha1_ce snd clk_phase meson_dw_hdmi s
oundcore fixed_phy dw_mipi_dsi meson_drm mc fwnode_mdio libphy rtc_meson_vrtc ao_cec_g12a meson_
canvas gpio_regulator pwm_regulator fixed cpufreq_dt aes_neon_bs aes_neon_blk aes_ce_blk aes_ce_
cipher [last unloaded: of_mdio]
[   91.216080] CPU: 4 PID: 1530 Comm: systemd-sleep Tainted: G         C OE      6.10.9-mnt-refo
rm-arm64 #1  Debian 6.10.9-1+reform20240916T143123Z
[   91.228977] Hardware name: MNT Pocket Reform with BPI-CM4 Module (DT)
[   91.235397] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   91.242323] pc : drm_atomic_helper_wait_for_vblanks.part.0+0x248/0x278
[   91.248822] lr : drm_atomic_helper_wait_for_vblanks.part.0+0x248/0x278
[   91.255322] sp : ffff800085deb470
[   91.258615] x29: ffff800085deb4a0 x28: 000000000000080e x27: 0000000000000000
[   91.265721] x26: 0000000000000001 x25: 0000000000000000 x24: ffff000002f97800
[   91.272828] x23: 0000000000000001 x22: 0000000000000000 x21: 0000000000000000
[   91.279934] x20: ffff00003f9b0f80 x19: ffff000003e30080 x18: 0000000000000006
[   91.287041] x17: 0000000000005048 x16: 000000000000506c x15: ffff800085deae80
[   91.294148] x14: 0000000000000000 x13: 74756f2064656d69 x12: ffff8000818502b0
[   91.301255] x11: 0000000000000001 x10: 0000000000000001 x9 : ffff800080250abc
[   91.308361] x8 : c0000000ffffefff x7 : ffff8000817f80e0 x6 : 0000000000000001
[   91.315468] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
[   91.322574] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff00001389b780
[   91.329685] Call trace:
[   91.332117]  drm_atomic_helper_wait_for_vblanks.part.0+0x248/0x278
[   91.338265]  drm_atomic_helper_commit_tail_rpm+0x8c/0xb0
[   91.343548]  commit_tail+0xac/0x1a0
[   91.347015]  drm_atomic_helper_commit+0x184/0x1a8
[   91.351696]  drm_atomic_commit+0xbc/0x100
[   91.355682]  drm_atomic_helper_commit_duplicated_state+0xfc/0x118
[   91.361748]  drm_atomic_helper_resume+0xb4/0x1a8
[   91.366341]  drm_mode_config_helper_resume+0x2c/0xd8
[   91.371282]  meson_drv_pm_resume+0x9c/0xb8 [meson_drm]
[   91.376396]  dpm_run_callback+0x98/0x1f8
[   91.380296]  device_resume+0x90/0x240
[   91.383935]  dpm_resume+0x170/0x240
[   91.387401]  dpm_resume_end+0x20/0x40
[   91.391041]  suspend_devices_and_enter+0x2bc/0x850
[   91.395808]  pm_suspend+0x21c/0x4f8
[   91.399275]  state_store+0x88/0x100
[   91.402742]  kobj_attr_store+0x18/0x30
[   91.406470]  sysfs_kf_write+0x4c/0x68
[   91.410108]  kernfs_fop_write_iter+0x13c/0x1d8
[   91.414528]  vfs_write+0x248/0x3b8
[   91.417909]  ksys_write+0x7c/0x120
[   91.421288]  __arm64_sys_write+0x24/0x38
[   91.425188]  invoke_syscall+0x78/0x108
[   91.428915]  el0_svc_common.constprop.0+0x48/0xf0
[   91.433595]  do_el0_svc+0x24/0x38
[   91.436888]  el0_svc+0x3c/0x128
[   91.440009]  el0t_64_sync_handler+0x120/0x130
[   91.444341]  el0t_64_sync+0x190/0x198
[   91.447988] ---[ end trace 0000000000000000 ]---
[   91.716757] xhci-hcd xhci-hcd.2.auto: xHC error in resume, USBSTS 0x411, Reinit
[   91.779635] usb usb1: root hub lost power or was reset
[   91.841867] usb usb2: root hub lost power or was reset
[   92.213361] usb 1-2: reset high-speed USB device number 2 using xhci-hcd
[   92.717080] usb 1-2.1: reset high-speed USB device number 3 using xhci-hcd
[   92.908961] usb 1-2.2: reset full-speed USB device number 4 using xhci-hcd
[   93.194563] usb 1-2.1.3: reset full-speed USB device number 5 using xhci-hcd
[   93.441161] usb 1-2.1.4: reset high-speed USB device number 6 using xhci-hcd
[   93.606532] mt76x2u 1-2.1.4:1.0: ROM patch build: 20141115060606a
[   93.844260] mt76x2u 1-2.1.4:1.0: Firmware Version: 0.0.00
[   93.905479] mt76x2u 1-2.1.4:1.0: Build: 1
[   93.965940] mt76x2u 1-2.1.4:1.0: Build Time: 201507311614____
1 Like

Yes, I’ve seen the same problem.

The main issue is that the A311D firmware or TF-A messes with an unknown number of clocks and sets them to a too low frequency, and Linux doesn’t know about this. Neil Armstrong from Baylibre told me that we could make progress here if we found a way to dump all the clocks before and after resume and then grab them off the device (over serial) and diff them. This would involve:

  • Finding a good spot in the kernel for this
  • Checking if there is already a clock tree dumping function (i.e. there is something in the kernel that generates the content of /sys/kernel/debug/clk/clk_summary which is what we want
  • Trigger this function directly before and after the suspend (actually directly after would be enough, because you can just grab the before-state from userspace at any time)
3 Likes

Thanks, that’s helpful to know you’re seeing this too.

Thinking through what you’re saying about the clocks — I get this trace before trying to resume, does this match your understanding?

It looks like it’s after resume. If you did not intend to resume maybe there is another bug that there is some interrupt that immediately triggers resume after suspend.

2 Likes

Yes, it instantly resumes, the “Enabling non-boot CPUs …” is a hint.

1 Like

I wrote a hacky patch that dumps /sys/kernel/debug/clk/clk_summary immediately before that “Enabling non-boot CPUs …” message in thaw_secondary_cpus():

It’s likely that in the after suspend dump the Linux kernel is showing cached values of what it thinks the clocks are, which may no longer be accurate.

Note that the dumping code in that patch is just a direct copy/paste of the clk_summary_*() logic in drivers/clk/clk.c that generates that sysfs file, but with seq_* prints replaced with pr_info()

I’ve actually been working on doing the same thing; I’ve compared your diff to mine and they’re nearly exactly the same, modulo some reordering towards the end and a small difference: mine has

- g12a_eth_core 0       0       0        166666664   0        0        50000     N
+ g12a_eth_core 1       1       0        166666664   0        0        50000     Y

and yours has

- g12a_vpu_intr 1       1        0        166666664   0          0     50000      Y                           ff600000.hdmi-tx                venci                    
+ g12a_vpu_intr 1       1        0        166666664   0          0     50000      N                           ff600000.hdmi-tx                venci

(Maybe your HDMI port was plugged in, maybe there’s a difference between where we dumped the clocks, or maybe we’re on different kernels?)

So the clocks that both of our results have differing pre- and post-suspend are:

  • g12a_ao_saradc_gate (disabled after resume)
  • ts (disabled after resume)
  • gp0_pll_dco (disabled after resume)
  • sys_pll_dco (enabled after resume)
  • ff64c00.mdio-multiplexer#pll (disabled after resume)
  • g12a_mipi_dsi_host (disabled after resume)
  • g12a_ao_saradc (disabled after resume)

I also added some other printk-style debug statements in kernel/power/suspend.c and ascertained that:

  • In suspend_enter, pm_sleep_disable_secondary_cpus() does not return an error.
  • In suspend_enter, syscore_suspend() does not return an error.
  • In suspend_devices_and_enter, the while loop exits when error and wakeup are both zero, so I guess it’s waking up because the result of platform_suspend_again(state) is nonzero.

My suspicion is that the change in some of the clocks related to video are causing the stacktrace in drm and meson I posted above. I’ve started working on a patch to restore all of these clocks to their previous states in meson_drv_pm_resume, that seems like the most natural place for a hack to validate this approach?

1 Like

I tried the following patch; it fails at the first clk_get. I think I don’t really understand how to use this API. In any case, I might not be able to look at this for a bit, so if anyone else wants to try please do:

diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c
index 17a5cca007e2..3d1b85c0938e 100644
--- a/drivers/gpu/drm/meson/meson_drv.c
+++ b/drivers/gpu/drm/meson/meson_drv.c
@@ -14,6 +14,8 @@
 #include <linux/sys_soc.h>
 #include <linux/platform_device.h>
 #include <linux/soc/amlogic/meson-canvas.h>
+#include <linux/clk.h>
+
 
 #include <drm/drm_aperture.h>
 #include <drm/drm_atomic_helper.h>
@@ -432,6 +434,68 @@ static int __maybe_unused meson_drv_pm_suspend(struct device *dev)
 	return drm_mode_config_helper_suspend(priv->drm);
 }
 
+static void meson_fix_clocks(void)
+{
+	printk("meson-drv Fixing clocks\n");
+	// TODO this clk_get fails so nothing happens
+	struct clk *clk = clk_get(NULL, "g12a_ao_saradc_gate");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk g12a_ao_saradc_gate\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "ts");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk ts\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "gp0_pll_dco");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk gp0_pll_dco\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "sys_pll_dco");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk sys_pll_dco\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "ff64c00.mdio-multiplexer#pll");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk ff64c00.mdio-multiplexer#pll\n");
+		return;
+	}
+	clk_disable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "g12a_mipi_dsi_host");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk g12a_mipi_dsi_host\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+
+	clk = clk_get(NULL, "g12a_ao_saradc");
+	if (IS_ERR(clk)) {
+		printk("Error getting clk g12a_ao_saradc\n");
+		return;
+	}
+	clk_enable(clk);
+	clk_put(clk);
+}
+
+
 static int __maybe_unused meson_drv_pm_resume(struct device *dev)
 {
 	struct meson_drm *priv = dev_get_drvdata(dev);
@@ -446,6 +510,7 @@ static int __maybe_unused meson_drv_pm_resume(struct device *dev)
 	if (priv->afbcd.ops)
 		priv->afbcd.ops->init(priv);
 
+	meson_fix_clocks();
 	return drm_mode_config_helper_resume(priv->drm);
 }