1. Print the waiting lock for "LWLockAcquire"
p *lockIn below sample, this process is waiting for lock "148" and the lock holder is PID 16108.
Core was generated by `postgres: port 5432, gpadmin template1 172.28.8.251(28066) con121275 172.28.8.2'. #0 0x00000036748cb696 in poll () from /lib64/libc.so.6 (gdb) bt #0 0x00000036748cb696 in poll () from /lib64/libc.so.6 #1 0x0000000000aff734 in ic_rx_thread_func (arg=<value optimized out>) at ic_udp.c:3695 #2 0x000000367540673d in start_thread () from /lib64/libpthread.so.0 #3 0x00000036748d44bd in clone () from /lib64/libc.so.6 (gdb) info thre 2 Thread 17194 0x00000036748d5887 in semop () from /lib64/libc.so.6 * 1 Thread 17195 0x00000036748cb696 in poll () from /lib64/libc.so.6 (gdb) thre 2 [Switching to thread 2 (Thread 17194)]#0 0x00000036748d5887 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00000036748d5887 in semop () from /lib64/libc.so.6 #1 0x0000000000854368 in PGSemaphoreLock (sema=0x2b67b685d910, interruptOK=0 '\000') at pg_sema.c:437 #2 0x00000000008f33f0 in LWLockAcquire (lockid=148, mode=<value optimized out>) at lwlock.c:561 #3 0x0000000000c0a621 in getTmLock (distribXid=<value optimized out>, localXid=<value optimized out>) at cdbtm.c:1706 #4 createDtx (distribXid=<value optimized out>, localXid=<value optimized out>) at cdbtm.c:2495 #5 0x00000000004fa1ff in StartTransaction () at xact.c:2995 #6 0x00000000004fa795 in StartTransactionCommand () at xact.c:4305 #7 0x0000000000904a0d in start_xact_command (query_string=<value optimized out>, seqServerHost=<value optimized out>, seqServerPort=<value optimized out>) at postgres.c:3048 #8 exec_simple_query (query_string=<value optimized out>, seqServerHost=<value optimized out>, seqServerPort=<value optimized out>) at postgres.c:1503 #9 0x0000000000905d97 in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4612 #10 0x000000000086dd61 in BackendRun (port=0x1263fc40) at postmaster.c:6779 #11 BackendStartup (port=0x1263fc40) at postmaster.c:6380 #12 0x0000000000874410 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2334 #13 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546 #14 0x0000000000779eea in main (argc=18, argv=0x125ff030) at main.c:212 (gdb) f 2 #2 0x00000000008f33f0 in LWLockAcquire (lockid=148, mode=<value optimized out>) at lwlock.c:561 561 lwlock.c: No such file or directory. in lwlock.c (gdb) p *lock $1 = {mutex = 0 '\000', releaseOK = 1 '\001', exclusive = 1 '\001', shared = 0, exclusivePid = 16108, head = 0x2b67b6867b80, tail = 0x2b67b67ff1a0}Note:LWlock is lightweight lock in Greenplum and is different than the locks in pg_locks view.
2. Print the locks hold by the process
p held_lwlocks p held_lwlocks_exclusiveIn below sample, this process is holding lock 148.
(gdb) p held_lwlocks $2 = {148, NullLock <repeats 99 times>}
3. Deadlock checking
p LWLockArray[some_lock]In below sample, FileRep recovery process is waiting for ProcArrayLock.
#0 0x000000374c6d5887 in semop () from /lib64/libc.so.6
#1 0x0000000000853af8 in PGSemaphoreLock (sema=0x2b5ec70f3fe0, interruptOK=0 '\000') at pg_sema.c:437
#2 0x00000000008ee270 in LWLockAcquire (lockid=ProcArrayLock, mode=<value optimized out>) at lwlock.c:561
#3 0x00000000008ce687 in ProcArrayRemove (proc=0x2b5ec70f3fd0, forPrepare=0 '\000', isCommit=1 '\001') at procarray.c:170
#4 0x00000000008caadd in shmem_exit (code=0) at ipc.c:205
#5 proc_exit_prepare (code=0) at ipc.c:165
#6 proc_exit (code=0) at ipc.c:93
#7 0x0000000000b42117 in FileRepSubProcess_Main () at cdbfilerepservice.c:862
#8 0x0000000000b38d3a in FileRep_StartChildProcess (type=FileRepProcessTypePrimaryRecovery) at cdbfilerep.c:2538
#9 0x0000000000b3ff61 in FileRep_Main () at cdbfilerep.c:3415
#10 0x0000000000593449 in AuxiliaryProcessMain (argc=7, argv=0x7fff5741bec0) at bootstrap.c:486
#11 0x0000000000868544 in StartChildProcess (type=<value optimized out>) at postmaster.c:7741
#12 0x000000000086872c in StartFilerepProcesses () at postmaster.c:1648
#13 0x000000000087a11d in stepDoFilerepStartup (isInShutdown=<value optimized out>) at primary_mirror_mode.c:1729
#14 applyStepForTransitionToPrimarySegmentMode (isInShutdown=<value optimized out>) at primary_mirror_mode.c:2013
#15 doRequestedPrimaryMirrorModeTransitions (isInShutdown=<value optimized out>) at primary_mirror_mode.c:1616
#16 0x00000000008743ca in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2355
#17 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546
#18 0x00000000007796ba in main (argc=15, argv=0x1585f030) at main.c:212
In below sample, this user process is waiting for WALWriteLock, and is holding WALInsertLock.(gdb) bt #0 0x000000374c6d5887 in semop () from /lib64/libc.so.6 #1 0x0000000000853af8 in PGSemaphoreLock (sema=0x2b5ec7251e90, interruptOK=0 '\000') at pg_sema.c:437 #2 0x00000000008ee270 in LWLockAcquire (lockid=WALWriteLock, mode=<value optimized out>) at lwlock.c:561 #3 0x000000000050a80c in AdvanceXLInsertBuffer (rmid=<value optimized out>, info=<value optimized out>, rdata=0x7fff5741b5c0, headerXid=<value optimized out>) at xlog.c:1751 #4 XLogInsert_Internal (rmid=<value optimized out>, info=<value optimized out>, rdata=0x7fff5741b5c0, headerXid=<value optimized out>) at xlog.c:1276 #5 0x000000000050bd64 in XLogInsert (rmid=83 'S', info=128 '\200', rdata=0x7fff5741b820) at xlog.c:757 #6 0x00000000004ce79a in heap_log_tuple_insert (relation=0x159d9960, tup=0x16249720) at heapam.c:2087 #7 heap_insert (relation=0x159d9960, tup=0x16249720) at heapam.c:2205 #8 simple_heap_insert (relation=0x159d9960, tup=0x16249720) at heapam.c:2254 #9 0x00000000005d0ae6 in add_attribute_encoding_entry (rel=<value optimized out>, attr_encodings=<value optimized out>) at pg_attribute_encoding.c:51 #10 AddRelationAttributeEncodings (rel=<value optimized out>, attr_encodings=<value optimized out>) at pg_attribute_encoding.c:359 #11 0x000000000068896f in ATExecAddColumn (tab=<value optimized out>, rel=0x15945690, colDef=0x1593a7e0) at tablecmds.c:6841 #12 0x00000000006a9b6f in ATExecCmd (wqueue=<value optimized out>, tab=0x15982e30, rel=0x15945690, cmd=0xffffffffffffffff) at tablecmds.c:5196 #13 0x00000000006aa9ce in ATRewriteCatalogs (rel=<value optimized out>, cmds=<value optimized out>, recurse=<value optimized out>, oidInfoCount=<value optimized out>, oidInfo=<value optimized out>, poidmap=<value optimized out>) at tablecmds.c:5054 #14 ATController (rel=<value optimized out>, cmds=<value optimized out>, recurse=<value optimized out>, oidInfoCount=<value optimized out>, oidInfo=<value optimized out>, poidmap=<value optimized out>) at tablecmds.c:3868 #15 0x00000000006abd1a in AlterTable (stmt=0x1593a6a8) at tablecmds.c:3795 #16 0x000000000090d184 in ProcessUtility (parsetree=0x1593a6a8, queryString=0x7fff5741b480 "\001", params=0x1, isTopLevel=1 '\001', dest=0x1593aa78, completionTag=0x7fff5741c0e0 "") at utility.c:1101 #17 0x000000000090a8b0 in PortalRunUtility (portal=0x1598ff00, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>) at pquery.c:1524 #18 PortalRunMulti (portal=0x1598ff00, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>) at pquery.c:1634 #19 PortalRun (portal=0x1598ff00, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>) at pquery.c:1151 #20 0x0000000000902339 in exec_mpp_query (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:1290 #21 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4873 #22 0x000000000086d371 in BackendRun (port=0x158c0c50) at postmaster.c:6779 #23 BackendStartup (port=0x158c0c50) at postmaster.c:6380 #24 0x0000000000873a20 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2334 #25 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546 #26 0x00000000007796ba in main (argc=15, argv=0x1585f030) at main.c:212 (gdb) p held_lwlocks $1 = {MirroredLock, 1746, WALInsertLock, NullLock <repeats 97 times>} (gdb) p LWLockArray[WALInsertLock] $2 = {lock = {mutex = 0 '\000', releaseOK = 1 '\001', exclusive = 1 '\001', shared = 0, exclusivePid = 2376, head = 0x2b5ec7255900, tail = 0x2b5ec7255900}, pad = "\000\001\001\000\000\000\000\000H\t\000\000\000\000\000\000\000Y%\307^+\000\000\000Y%\307^+\000"}In below sample, this process is waiting for WALInsertLock, and is holding ProcArrayLock.
(gdb) bt #0 0x000000374c6d5887 in semop () from /lib64/libc.so.6 #1 0x0000000000853af8 in PGSemaphoreLock (sema=0x2b5ec7255910, interruptOK=0 '\000') at pg_sema.c:437 #2 0x00000000008ee270 in LWLockAcquire (lockid=WALInsertLock, mode=<value optimized out>) at lwlock.c:561 #3 0x000000000050a316 in XLogInsert_Internal (rmid=<value optimized out>, info=<value optimized out>, rdata=<value optimized out>, headerXid=<value optimized out>) at xlog.c:1025 #4 0x000000000050bd64 in XLogInsert (rmid=83 'S', info=176 '\260', rdata=0x7fff5741bd50) at xlog.c:757 #5 0x00000000004ef801 in DistributedLog_WriteZeroPageXlogRec (newestXact=141090816) at distributedlog.c:799 #6 DistributedLog_ZeroPage (newestXact=141090816) at distributedlog.c:464 #7 DistributedLog_Extend (newestXact=141090816) at distributedlog.c:679 #8 0x00000000004f1475 in GetNewTransactionId (isSubXact=0 '\000', setProcXid=0 '\000') at varsup.c:119 #9 0x0000000000b8cfa9 in LocalDistribXact_Start (newDistribTimeStamp=<value optimized out>, newDistribXid=1892192, newLocalXid=0x10b2544) at cdblocaldistribxact.c:433 #10 LocalDistribXact_StartOnSegment (newDistribTimeStamp=<value optimized out>, newDistribXid=1892192, newLocalXid=0x10b2544) at cdblocaldistribxact.c:498 #11 0x00000000004fa228 in StartTransaction () at xact.c:2913 #12 0x00000000004fa315 in StartTransactionCommand () at xact.c:4145 #13 0x0000000000c028c4 in doQEDistributedExplicitBegin (dtxContextInfo=0x1094fc0) at cdbtm.c:3512 #14 setupQEDtxContext (dtxContextInfo=0x1094fc0) at cdbtm.c:3776 #15 0x0000000000901b9a in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4840 #16 0x000000000086d371 in BackendRun (port=0x158c0c50) at postmaster.c:6779 #17 BackendStartup (port=0x158c0c50) at postmaster.c:6380 #18 0x0000000000873a20 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2334 #19 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546 #20 0x00000000007796ba in main (argc=15, argv=0x1585f030) at main.c:212 (gdb) p held_lwlocks $1 = {MirroredLock, ProcArrayLock, XidGenLock, DistributedLogControlLock, NullLock <repeats 96 times>} (gdb) p LWLockArray[ProcArrayLock] $2 = {lock = {mutex = 0 '\000', releaseOK = 1 '\001', exclusive = 1 '\001', shared = 0, exclusivePid = 24290, head = 0x2b5ec7253880, tail = 0x2b5ec7193c80}, pad = "\000\001\001\000\000\000\000\000\342^\000\000\000\000\000\000\200\070%\307^+\000\000\200<\031\307^+\000"}
4. Show alive File replication sub processes of "primary" process of primary segment.
p FileRepSubProcListIn below sample, gdb is run on "primary" process – 32390.
[gpadmin@sdw15 ~]$ ps -ef|grep 48722 gpadmin 31303 28480 0 06:19 pts/0 00:00:00 grep 48722 gpadmin 32356 1 0 Apr05 ? 00:05:00 /usr/local/xxx_db/greenplum-db-4.2.2.1/bin/postgres -D /data/xxx/primary/gpseg9 -p 48722 -b 20 -z 18 --silent-mode=true -i -M quiescent -C 9 gpadmin 32376 32356 0 Apr05 ? 00:00:17 postgres: port 48722, logger process gpadmin 32390 32356 0 Apr05 ? 00:02:44 postgres: port 48722, primary process gpadmin 32402 32390 0 Apr05 ? 00:00:44 postgres: port 48722, primary receiver ack process gpadmin 32406 32390 0 Apr05 ? 00:04:12 postgres: port 48722, primary sender process gpadmin 32416 32390 0 Apr05 ? 00:00:36 postgres: port 48722, primary consumer ack process gpadmin 32418 32390 0 Apr05 ? 00:00:00 postgres: port 48722, primary recovery process gpadmin 32420 32390 0 Apr05 ? 00:00:00 postgres: port 48722, primary verification process gpadmin 32444 32356 0 Apr05 ? 00:00:00 postgres: port 48722, stats collector process gpadmin 32445 32356 0 Apr05 ? 00:00:03 postgres: port 48722, writer process gpadmin 32446 32356 0 Apr05 ? 00:00:21 postgres: port 48722, checkpoint process gpadmin 32447 32356 0 Apr05 ? 00:00:02 postgres: port 48722, sweeper process gpadmin 32448 32356 0 Apr05 ? 00:00:01 postgres: port 48722, stats sender process [gpadmin@sdw15 ~]$ gdb /usr/local/xxx_db/greenplum-db-4.2.2.1/bin/postgres 32390 (gdb) p FileRepSubProcList $1 = {{pid = 0, fileRepProcessType = FileRepProcessTypeNotInitialized}, {pid = 0, fileRepProcessType = FileRepProcessTypeMain}, { pid = 32406, fileRepProcessType = FileRepProcessTypePrimarySender}, {pid = 0, fileRepProcessType = FileRepProcessTypeMirrorReceiver}, { pid = 0, fileRepProcessType = FileRepProcessTypeMirrorConsumer}, {pid = 0, fileRepProcessType = FileRepProcessTypeMirrorSenderAck}, { pid = 32402, fileRepProcessType = FileRepProcessTypePrimaryReceiverAck}, {pid = 32416, fileRepProcessType = FileRepProcessTypePrimaryConsumerAck}, {pid = 32418, fileRepProcessType = FileRepProcessTypePrimaryRecovery}, { pid = 0, fileRepProcessType = FileRepProcessTypeMirrorConsumerWriter}, {pid = 0, fileRepProcessType = FileRepProcessTypeMirrorConsumerAppendOnly1}, {pid = 0, fileRepProcessType = FileRepProcessTypeResyncManager}, { pid = 0, fileRepProcessType = FileRepProcessTypeResyncWorker1}, {pid = 0, fileRepProcessType = FileRepProcessTypeResyncWorker2}, { pid = 0, fileRepProcessType = FileRepProcessTypeResyncWorker3}, {pid = 0, fileRepProcessType = FileRepProcessTypeResyncWorker4}, { pid = 32420, fileRepProcessType = FileRepProcessTypePrimaryVerification}, {pid = 0, fileRepProcessType = FileRepProcessTypeMirrorVerification}}
5. Print current SQL
p debug_query_string
eg:
(gdb) p debug_query_string $1 = 0xcb592a0 "delete from test ;"
6. Print current session and segment
p gp_session_id p Gp_segmenteg:
(gdb) p gp_session_id $6 = 37 (gdb) p Gp_segment $7 = -1
7. Print local params
info locals
eg:
(gdb) f 4
#4 0x00000000008e2f48 in LockAcquire (locktag=0x7fff3ce436a0, lockmode=<value optimized out>, sessionLock=<value optimized out>,
dontWait=<value optimized out>) at lock.c:1022
1022 lock.c: No such file or directory.
in lock.c
(gdb) info locals
lockmethodid = <value optimized out>
lockMethodTable = 0xdb02a0
localtag = {lock = {locktag_field1 = 17137, locktag_field2 = 17028, locktag_field3 = 0, locktag_field4 = 0, locktag_type = 0 '\000',
locktag_lockmethodid = 1 '\001'}, mode = 7}
locallock = 0xcb07250
lock = 0x2b541ba2da48
proclock = 0x2b541c08b770
proclocktag = {myLock = 0x2b541ba2da48, myProc = 0x2b541e91f200}
found = 0 '\000'
owner = 0xca83608
hashcode = 2660616842
proclock_hashcode = 2005592714
partitionLock = 227
status = -4
__func__ = "LockAcquire"
8. pg_cancel_backend reset postmaster issue(Fixed in latest version.)
(gdb) p PostmasterMain $1 = {int (int, char **)} 0x879490 <PostmasterMain> (gdb) disas GetCurrentCommandId Dump of assembler code for function GetCurrentCommandId: 0x00000000004f3290 <GetCurrentCommandId+0>: push %rbp 0x00000000004f3291 <GetCurrentCommandId+1>: mov 0xbe903d(%rip),%eax # 0x10dc2d4 <currentCommandId> 0x00000000004f3297 <GetCurrentCommandId+7>: mov %rsp,%rbp 0x00000000004f329a <GetCurrentCommandId+10>: leaveq 0x00000000004f329b <GetCurrentCommandId+11>: retq End of assembler dump. (gdb) p errordata[0] $2 = { elevel = 20, output_to_server = 1 '\001', output_to_client = 1 '\001', show_funcname = 0 '\000', omit_location = 1 '\001', fatal_return = 0 '\000', hide_stmt = 0 '\000', send_alert = 0 '\000', filename = 0xdba330 "postgres.c", lineno = 3473, funcname = 0xdb6300 "ProcessInterrupts", domain = 0xd8b9d0 "postgres-8.2", sqlerrcode = 67371461, message = 0x1c33d4c0 "canceling statement due to user request", detail = 0x0, detail_log = 0x0, hint = 0x0, context = 0x0, cursorpos = 0, internalpos = 0, internalquery = 0x0, saved_errno = 32, stacktracearray = {0xa71f35, 0x907e5c, 0xb19605, 0xb00b66, 0x726649, 0x6f1bf5, 0x6ee8d4, 0x6f00d0, 0x915529, 0x909dc5, 0x90b6e7, 0x8746d1, 0x87ad00, 0x77ff3a, 0x2b1585f3c9c4, 0x47c449, 0x0 <repeats 14 times>}, stacktracesize = 16 } (gdb) info threads 3 Thread 26077 0x00002b1585feb116 in poll () from /lib64/libc.so.6 2 Thread 26261 0x00002b1585fed1d2 in select () from /lib64/libc.so.6 * 1 Thread 0x2b158833fbe0 (LWP 26076) 0x00002b1585f4f182 in __longjmp () from /lib64/libc.so.6 (gdb) p gp_crash_handler_async $3 = 0 '\000' (gdb) thread 2 [Switching to thread 2 (Thread 26261)]#0 0x00002b1585fed1d2 in select () from /lib64/libc.so.6 (gdb) p gp_crash_handler_async $4 = 0 '\000' (gdb) p error_context_stack $5 = (ErrorContextCallback *) 0x0 (gdb) p *CurrentMemoryContext $6 = { type = T_AllocSetContext, methods = { alloc = 0xaaae60 <AllocSetAlloc>, free_p = 0xaab3d0 <AllocSetFree>, realloc = 0xaabee0 <AllocSetRealloc>, init = 0xaaad20 <AllocSetInit>, reset = 0xaab2b0 <AllocSetReset>, delete_context = 0xaab1c0 <AllocSetDelete>, get_chunk_space = 0xaaad30 <AllocSetGetChunkSpace>, is_empty = 0xaaad40 <AllocSetIsEmpty>, stats = 0xaab4c0 <AllocSetStats> }, parent = 0x1c33a6f0, firstchild = 0x0, nextchild = 0x0, name = 0x1c33ace8 "ErrorContext", allBytesAlloc = 8192, allBytesFreed = 0, maxBytesHeld = 8192, localMinHeld = 0 }
9. Find lock holder information using the core of lock waiter
Assume you have the cores of a waiting session and it is waiting for "LockAcquire".How to find the lock holder information using the core? Answer is :
p *((PROCLOCK*) (ShmemBase+((unsigned long) lock->procLocks.next) - ((long) &((PROCLOCK*)0)->lockLink)))->tag.myProcFor example:
(gdb) bt
#0 0x0000003aae6d5887 in semop () from /lib64/libc.so.6
#1 0x000000000085ad98 in PGSemaphoreLock (sema=0x2b589d58d890, interruptOK=1 '\001') at pg_sema.c:437
#2 0x00000000008e812e in ProcSleep (locallock=<value optimized out>, lockMethodTable=<value optimized out>) at proc.c:1004
#3 0x00000000008e15e2 in WaitOnLock (locallock=<value optimized out>, owner=<value optimized out>) at lock.c:1378
#4 0x00000000008e2f48 in LockAcquire (locktag=0x7fff883480f0, lockmode=<value optimized out>, sessionLock=<value optimized out>, dontWait=<value optimized out>) at lock.c:1022
#5 0x00000000008dfdcb in LockRelationOid (relid=59290, lockmode=8) at lmgr.c:84
#6 0x00000000004c62f6 in try_relation_open (relationId=59290, lockmode=8, noWait=1 '\001') at heapam.c:908
#7 0x00000000006d2663 in open_relation_and_check_permission (vacstmt=0x16fff88, relids=<value optimized out>) at vacuum.c:4029
#8 vacuumStatement (vacstmt=0x16fff88, relids=<value optimized out>) at vacuum.c:460
#9 0x00000000006d3496 in vacuum (vacstmt=0x16fff88, relids=<value optimized out>) at vacuum.c:285
#10 0x00000000009179be in ProcessUtility (parsetree=0x16fff88, queryString=<value optimized out>, params=0x1, isTopLevel=1 '\001', dest=0x1700308, completionTag=0x7fff88348680 "")
at utility.c:1377
#11 0x0000000000915390 in PortalRunUtility (portal=0x17036f0, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>,
completionTag=<value optimized out>) at pquery.c:1525
#12 PortalRunMulti (portal=0x17036f0, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>,
completionTag=<value optimized out>) at pquery.c:1635
#13 PortalRun (portal=0x17036f0, count=<value optimized out>, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>,
completionTag=<value optimized out>) at pquery.c:1152
#14 0x0000000000909e95 in exec_simple_query (query_string=<value optimized out>, seqServerHost=<value optimized out>, seqServerPort=<value optimized out>) at postgres.c:1721
#15 0x000000000090b7a7 in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4612
#16 0x00000000008747a1 in BackendRun (port=0x162e580) at postmaster.c:6785
#17 BackendStartup (port=0x162e580) at postmaster.c:6386
#18 0x000000000087add0 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2340
#19 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546
#20 0x000000000078000a in main (argc=18, argv=0x1606030) at main.c:212
Lock information of current waiting session: pid=8637,mppSessionId=30.(gdb) f 4 #4 0x00000000008e2f48 in LockAcquire (locktag=0x7fff883480f0, lockmode=<value optimized out>, sessionLock=<value optimized out>, dontWait=<value optimized out>) at lock.c:1022 1022 lock.c: No such file or directory. in lock.c (gdb) p *proclock->tag.myProc $1 = {links = {prev = 140902152, next = 140902152}, sem = {semId = 354844776, semNum = 5}, waitStatus = 2, xid = 4933, localDistribXactRef = {eyecatcher = "LDX", index = 203}, xmin = 4929, pid = 8367, databaseId = 17128, roleId = 10, mppSessionId = 30, mppLocalProcessSerial = 30, mppIsWriter = 1 '\001', postmasterResetRequired = 1 '\001', inVacuum = 0 '\000', lwWaiting = 0 '\000', lwExclusive = 0 '\000', lwWaitLink = 0x0, waitLock = 0x2b589a69bee0, waitProcLock = 0x2b589acf9ad0, waitLockMode = 8, heldLocks = 0, myProcLocks = {{prev = 147577592, next = 147577592}, {prev = 190126328, next = 190126328}, {prev = 190126344, next = 190126344}, {prev = 190126360, next = 190126360}, { prev = 190126376, next = 190126376}, {prev = 190126392, next = 190126392}, {prev = 190126408, next = 190126408}, {prev = 190126424, next = 190126424}, {prev = 147577208, next = 147577208}, {prev = 190126456, next = 190126456}, {prev = 190126472, next = 190126472}, {prev = 190126488, next = 190126488}, {prev = 190126504, next = 190126504}, { prev = 190126520, next = 190126520}, {prev = 190126536, next = 190126536}, {prev = 190126552, next = 190126552}}, subxids = {overflowed = 0 '\000', nxids = 0, xids = { 0 <repeats 64 times>}}, waitPortalId = 4294967295, combocid_map_count = 15, combocid_map_name = "sess18_w24896_combocid_map", '\000' <repeats 997 times>, queryCommandId = 2}Lock information of locker session: pid=8741,mppSessionId = 28
(gdb) p *((PROCLOCK*) (ShmemBase+((unsigned long) lock->procLocks.next) - ((long) &((PROCLOCK*)0)->lockLink)))->tag.myProc $2 = {links = {prev = 18446744073709551615, next = 18446744073709551615}, sem = {semId = 354844776, semNum = 4}, waitStatus = 0, xid = 4929, localDistribXactRef = {eyecatcher = "LDX", index = 200}, xmin = 4929, pid = 8741, databaseId = 17128, roleId = 10, mppSessionId = 28, mppLocalProcessSerial = 28, mppIsWriter = 1 '\001', postmasterResetRequired = 1 '\001', inVacuum = 0 '\000', lwWaiting = 0 '\000', lwExclusive = 0 '\000', lwWaitLink = 0x0, waitLock = 0x0, waitProcLock = 0x0, waitLockMode = 8, heldLocks = 0, myProcLocks = {{ prev = 147577688, next = 147577688}, {prev = 190124664, next = 190124664}, {prev = 190124680, next = 190124680}, {prev = 190124696, next = 190124696}, {prev = 190124712, next = 190124712}, {prev = 190124728, next = 190124728}, {prev = 147577400, next = 147577400}, {prev = 190124760, next = 190124760}, {prev = 190124776, next = 190124776}, { prev = 190124792, next = 190124792}, {prev = 190124808, next = 190124808}, {prev = 190124824, next = 190124824}, {prev = 190124840, next = 190124840}, {prev = 190124856, next = 190124856}, {prev = 190124872, next = 190124872}, {prev = 190124888, next = 190124888}}, subxids = {overflowed = 0 '\000', nxids = 0, xids = {0 <repeats 64 times>}}, waitPortalId = 4294967295, combocid_map_count = 15, combocid_map_name = "sess28_w8741_combocid_map", '\000' <repeats 998 times>, queryCommandId = 194}
10. GPText queries hung in GPText 1.2.0.0 due to timeout. (Fixed in latest version)
Check the hung process of problematic session on one segment, it is waiting for "get_data" and err_buf shows "transfer closed with outstanding read data remaining".(gdb) bt #0 0x0000003c488cd722 in select () from /lib64/libc.so.6 #1 0x00002aaaab03da29 in get_data (state=0xcadad50) at solr_curl.c:630 #2 0x00002aaaab03c13d in get_next_result (fcinfo=0xc7e6e50) at solr_search.c:155 #3 solr_search (fcinfo=0xc7e6e50) at solr_search.c:982 #4 0x00000000007301b1 in TableFunctionNext (node=0xc7e3bc0) at nodeTableFunction.c:149 #5 0x00000000006f1b7e in ExecProcNode (node=0xc7e3bc0) at execProcnode.c:609 #6 0x0000000000726aa6 in execMotionSender (node=0x2aaaaae115c8) at nodeMotion.c:348 #7 ExecMotion (node=0x2aaaaae115c8) at nodeMotion.c:315 #8 0x00000000006f1cd5 in ExecProcNode (node=0x2aaaaae115c8) at execProcnode.c:657 #9 0x00000000006ee9b4 in ExecutePlan (estate=0xc408d50, planstate=<value optimized out>, operation=<value optimized out>, numberTuples=<value optimized out>, direction=<value optimized out>, dest=<value optimized out>) at execMain.c:2361 #10 0x00000000006f015e in ExecutorRun (queryDesc=<value optimized out>, direction=<value optimized out>, count=<value optimized out>) at execMain.c:862 #11 0x00000000009170b9 in PortalRunSelect (portal=<value optimized out>, count=0, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>) at pquery.c:1283 #12 PortalRun (portal=<value optimized out>, count=0, isTopLevel=<value optimized out>, dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>) at pquery.c:1105 #13 0x000000000090f839 in exec_mpp_query (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:1291 #14 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4826 #15 0x0000000000875ac1 in BackendRun (port=0xc2e85e0) at postmaster.c:6801 #16 BackendStartup (port=0xc2e85e0) at postmaster.c:6402 #17 0x000000000087bc00 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2340 #18 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1546 #19 0x0000000000780ada in main (argc=15, argv=0xc2b8030) at main.c:212 (gdb) p *state $6 = {curl = 0xcaf9100, curlm = 0xcb01db0, still_running = 0, timeout = 600, curl_ret = 0, curlm_ret = 0, http_rc = 200, err_buf = 0xcadade0 "transfer closed with outstanding read data remaining", headers = 0xcaf6640, context = 0xcac9940, buf = { data = 0x2aaaab251040 "\"__pk\",score\n7918990,1.0\n7919038,1.0\n7924704,1.0\n7925415,1.0\n7919604,1.0\n7923455,1.0\n7919183,1.0\n7923343,1.0\n7922114,1.0\n7919279,1.0\n7920769,1.0\n7924395,1.0\n7920576,1.0\n7923102,1.0\n7923375,1.0\n7921046"..., send_data_end = 0x2aaaab251040 "\"__pk\",score\n7918990,1.0\n7919038,1.0\n7924704,1.0\n7925415,1.0\n7919604,1.0\n7923455,1.0\n7919183,1.0\n7923343,1.0\n7922114,1.0\n7919279,1.0\n7920769,1.0\n7924395,1.0\n7920576,1.0\n7923102,1.0\n7923375,1.0\n7921046"..., bottom = 0x2aaaab28d035 "8660266,1.0", top = 0x2aaaab28d040 "", size = 4194304}, post_data = 0xcadaef0 "q=%2A%3A%2A&rows=1000000&fl=__pk%2Cscore&wt=csv&csv.header=true&start=0", post_data_size = 71}
Note: All above stuff are deep dive into core dumps of Greenplum processes using gdb.
Thanks - some really good information here. :)
ReplyDelete